// This file is part of AsmJit project <https://asmjit.com>
//
// See <asmjit/core.h> or LICENSE.md for license and copyright information
// SPDX-License-Identifier: Zlib

#include <asmjit/core.h>

#if !defined(ASMJIT_NO_X86)
#include <asmjit/x86.h>

#include <limits>
#include <stdio.h>
#include <string.h>

#include "asmjit_bench_codegen.h"
#include "../tests/asmjit_test_misc.h"

using namespace asmjit;

// Selects the operand form used by the sequence generators below.
enum class InstForm {
  kReg, // Instructions use register operands only.
  kMem  // Instructions use a memory operand where the encoding allows it.
};

// Generates a long sequence of GP (general purpose) instructions.
//
// The `form` argument selects between register operands (InstForm::kReg) and
// memory operands (InstForm::kMem). The sequence only exercises the emitter
// for benchmarking purposes; the generated code is not intended to be executed.
template<typename Emitter>
static void generate_gp_sequence_internal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& a, const x86::Gp& b, const x86::Gp& c, const x86::Gp& d) {

  // Seed all four registers with recognizable constants.
  cc.mov(a, 0xAAAAAAAA);
  cc.mov(b, 0xBBBBBBBB);
  cc.mov(c, 0xCCCCCCCC);
  cc.mov(d, 0xFFFFFFFF);

  if (form == InstForm::kReg) {
    // Base ALU, bit-scan, bit-test, and conditional-move instructions.
    cc.adc(a, b);
    cc.adc(b, c);
    cc.adc(c, d);
    cc.add(a, b);
    cc.add(b, c);
    cc.add(c, d);
    cc.and_(a, b);
    cc.and_(b, c);
    cc.and_(c, d);
    cc.bsf(a, b);
    cc.bsf(b, c);
    cc.bsf(c, d);
    cc.bsr(a, b);
    cc.bsr(b, c);
    cc.bsr(c, d);
    cc.bswap(a);
    cc.bswap(b);
    cc.bswap(c);
    cc.bt(a, b);
    cc.bt(b, c);
    cc.bt(c, d);
    cc.btc(a, b);
    cc.btc(b, c);
    cc.btc(c, d);
    cc.btr(a, b);
    cc.btr(b, c);
    cc.btr(c, d);
    cc.bts(a, b);
    cc.bts(b, c);
    cc.bts(c, d);
    cc.cmp(a, b);
    cc.cmovc(a, b);
    cc.cmp(b, c);
    cc.cmovc(b, c);
    cc.cmp(c, d);
    cc.cmovc(c, d);
    cc.dec(a);
    cc.dec(b);
    cc.dec(c);
    cc.imul(a, b);
    cc.imul(b, c);
    cc.imul(c, d);
    cc.movsx(a, b.r8_lo());
    cc.movsx(b, c.r8_lo());
    cc.movsx(c, d.r8_lo());
    cc.movzx(a, b.r8_lo());
    cc.movzx(b, c.r8_lo());
    cc.movzx(c, d.r8_lo());
    cc.neg(a);
    cc.neg(b);
    cc.neg(c);
    cc.not_(a);
    cc.not_(b);
    cc.not_(c);
    cc.or_(a, b);
    cc.or_(b, c);
    cc.or_(c, d);
    cc.sbb(a, b);
    cc.sbb(b, c);
    cc.sbb(c, d);
    cc.sub(a, b);
    cc.sub(b, c);
    cc.sub(c, d);
    cc.test(a, b);
    cc.test(b, c);
    cc.test(c, d);
    cc.xchg(a, b);
    cc.xchg(b, c);
    cc.xchg(c, d);
    cc.xor_(a, b);
    cc.xor_(b, c);
    cc.xor_(c, d);

    // Rotates and shifts - the count comes from the low 8 bits of `c`.
    cc.rcl(a, c.r8_lo());
    cc.rcl(b, c.r8_lo());
    cc.rcl(d, c.r8_lo());
    cc.rcr(a, c.r8_lo());
    cc.rcr(b, c.r8_lo());
    cc.rcr(d, c.r8_lo());
    cc.rol(a, c.r8_lo());
    cc.rol(b, c.r8_lo());
    cc.rol(d, c.r8_lo());
    cc.ror(a, c.r8_lo());
    cc.ror(b, c.r8_lo());
    cc.ror(d, c.r8_lo());
    cc.shl(a, c.r8_lo());
    cc.shl(b, c.r8_lo());
    cc.shl(d, c.r8_lo());
    cc.shr(a, c.r8_lo());
    cc.shr(b, c.r8_lo());
    cc.shr(d, c.r8_lo());
    cc.sar(a, c.r8_lo());
    cc.sar(b, c.r8_lo());
    cc.sar(d, c.r8_lo());
    cc.shld(a, b, c.r8_lo());
    cc.shld(b, d, c.r8_lo());
    cc.shld(d, a, c.r8_lo());
    cc.shrd(a, b, c.r8_lo());
    cc.shrd(b, d, c.r8_lo());
    cc.shrd(d, a, c.r8_lo());

    // ADX, BMI, and BMI2 instructions.
    cc.adcx(a, b);
    cc.adox(a, b);
    cc.adcx(b, c);
    cc.adox(b, c);
    cc.adcx(c, d);
    cc.adox(c, d);
    cc.andn(a, b, c);
    cc.andn(b, c, d);
    cc.andn(c, d, a);
    cc.bextr(a, b, c);
    cc.bextr(b, c, d);
    cc.bextr(c, d, a);
    cc.blsi(a, b);
    cc.blsi(b, c);
    cc.blsi(c, d);
    cc.blsmsk(a, b);
    cc.blsmsk(b, c);
    cc.blsmsk(c, d);
    cc.blsr(a, b);
    cc.blsr(b, c);
    cc.blsr(c, d);
    cc.bzhi(a, b, c);
    cc.bzhi(b, c, d);
    cc.bzhi(c, d, a);
    cc.lzcnt(a, b);
    cc.lzcnt(b, c);
    cc.lzcnt(c, d);
    cc.pdep(a, b, c);
    cc.pdep(b, c, d);
    cc.pdep(c, d, a);
    cc.pext(a, b, c);
    cc.pext(b, c, d);
    cc.pext(c, d, a);
    cc.popcnt(a, b);
    cc.popcnt(b, c);
    cc.popcnt(c, d);
    cc.rorx(a, b, 8);
    cc.rorx(b, c, 8);
    cc.rorx(c, d, 8);
    cc.sarx(a, b, c);
    cc.sarx(b, c, d);
    cc.sarx(c, d, a);
    cc.shlx(a, b, c);
    cc.shlx(b, c, d);
    cc.shlx(c, d, a);
    cc.shrx(a, b, c);
    cc.shrx(b, c, d);
    cc.shrx(c, d, a);
    cc.tzcnt(a, b);
    cc.tzcnt(b, c);
    cc.tzcnt(c, d);
  }
  else {
    uint32_t register_size = cc.register_size();
    // Register-sized memory operand at [c] (base only, zero displacement), plus
    // a byte-sized operand for movsx/movzx.
    x86::Mem m = x86::ptr(c, 0, register_size);
    x86::Mem m8 = x86::byte_ptr(c);

    // Base ALU, bit-scan, bit-test, and conditional-move instructions.
    cc.adc(a, m);
    cc.adc(b, m);
    cc.adc(c, m);
    cc.add(a, m);
    cc.add(b, m);
    cc.add(c, m);
    cc.and_(a, m);
    cc.and_(b, m);
    cc.and_(c, m);
    cc.bsf(a, m);
    cc.bsf(b, m);
    cc.bsf(c, m);
    cc.bsr(a, m);
    cc.bsr(b, m);
    cc.bsr(c, m);
    cc.bt(m, a);
    cc.bt(m, b);
    cc.bt(m, c);
    cc.btc(m, a);
    cc.btc(m, b);
    cc.btc(m, c);
    cc.btr(m, a);
    cc.btr(m, b);
    cc.btr(m, c);
    cc.bts(m, a);
    cc.bts(m, b);
    cc.bts(m, c);
    cc.cmp(a, m);
    cc.cmovc(a, m);
    cc.cmp(b, m);
    cc.cmovc(b, m);
    cc.cmp(c, m);
    cc.cmovc(c, m);
    cc.dec(m);
    cc.movsx(a, m8);
    cc.movsx(b, m8);
    cc.movsx(c, m8);
    cc.movzx(a, m8);
    cc.movzx(b, m8);
    cc.movzx(c, m8);
    cc.neg(m);
    cc.not_(m);
    cc.or_(a, m);
    cc.or_(b, m);
    cc.or_(c, m);
    cc.sbb(a, m);
    cc.sbb(b, m);
    cc.sbb(c, m);
    cc.sub(a, m);
    cc.sub(b, m);
    cc.sub(c, m);
    cc.test(m, a);
    cc.test(m, b);
    cc.test(m, c);
    cc.xchg(a, m);
    cc.xchg(b, m);
    cc.xchg(c, m);
    cc.xor_(a, m);
    cc.xor_(b, m);
    cc.xor_(c, m);

    // Rotates and shifts - the count comes from the low 8 bits of `c`.
    cc.rcl(m, c.r8_lo());
    cc.rcr(m, c.r8_lo());
    cc.rol(m, c.r8_lo());
    cc.ror(m, c.r8_lo());
    cc.shl(m, c.r8_lo());
    cc.shr(m, c.r8_lo());
    cc.sar(m, c.r8_lo());
    cc.shld(m, b, c.r8_lo());
    cc.shld(m, d, c.r8_lo());
    cc.shld(m, a, c.r8_lo());
    cc.shrd(m, b, c.r8_lo());
    cc.shrd(m, d, c.r8_lo());
    cc.shrd(m, a, c.r8_lo());

    // ADX, BMI, and BMI2 instructions.
    cc.adcx(a, m);
    cc.adox(a, m);
    cc.adcx(b, m);
    cc.adox(b, m);
    cc.adcx(c, m);
    cc.adox(c, m);
    cc.andn(a, b, m);
    cc.andn(b, c, m);
    cc.andn(c, d, m);
    cc.bextr(a, m, c);
    cc.bextr(b, m, d);
    cc.bextr(c, m, a);
    cc.blsi(a, m);
    cc.blsi(b, m);
    cc.blsi(c, m);
    cc.blsmsk(a, m);
    cc.blsmsk(b, m);
    cc.blsmsk(c, m);
    cc.blsr(a, m);
    cc.blsr(b, m);
    cc.blsr(c, m);
    cc.bzhi(a, m, c);
    cc.bzhi(b, m, d);
    cc.bzhi(c, m, a);
    cc.lzcnt(a, m);
    cc.lzcnt(b, m);
    cc.lzcnt(c, m);
    cc.pdep(a, b, m);
    cc.pdep(b, c, m);
    cc.pdep(c, d, m);
    cc.pext(a, b, m);
    cc.pext(b, c, m);
    cc.pext(c, d, m);
    cc.popcnt(a, m);
    cc.popcnt(b, m);
    cc.popcnt(c, m);
    cc.rorx(a, m, 8);
    cc.rorx(b, m, 8);
    cc.rorx(c, m, 8);
    cc.sarx(a, m, c);
    cc.sarx(b, m, d);
    cc.sarx(c, m, a);
    cc.shlx(a, m, c);
    cc.shlx(b, m, d);
    cc.shlx(c, m, a);
    cc.shrx(a, m, c);
    cc.shrx(b, m, d);
    cc.shrx(c, m, a);
    cc.tzcnt(a, m);
    cc.tzcnt(b, m);
    cc.tzcnt(c, m);
  }
}

// Emits a trivial "return 0" function through whatever emitter was passed in.
//
// The Compiler path creates a proper function via add_func/end_func; the
// Builder and Assembler paths either emit a bare `mov eax, 0; ret` or wrap
// the move in a full prolog/epilog sequence when `emit_prolog_epilog` is true.
static void generate_empty_function(BaseEmitter& emitter, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& compiler = *emitter.as<Compiler>();

    Gp result = compiler.new_gp32("rv");

    compiler.add_func(FuncSignature::build<uint32_t>());
    compiler.mov(result, 0);
    compiler.ret(result);
    compiler.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& builder = *emitter.as<Builder>();

    Gp result = eax;

    if (!emit_prolog_epilog) {
      builder.mov(result, 0);
      builder.ret();
    }
    else {
      FuncDetail fn_detail;
      fn_detail.init(FuncSignature::build<uint32_t>(), builder.environment());

      FuncFrame fn_frame;
      fn_frame.init(fn_detail);
      fn_frame.add_dirty_regs(result);
      fn_frame.finalize();

      builder.emit_prolog(fn_frame);
      builder.mov(result, 0);
      builder.emit_epilog(fn_frame);
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& assembler = *emitter.as<Assembler>();

    Gp result = eax;

    if (!emit_prolog_epilog) {
      assembler.mov(result, 0);
      assembler.ret();
    }
    else {
      FuncDetail fn_detail;
      fn_detail.init(FuncSignature::build<uint32_t>(), assembler.environment());

      FuncFrame fn_frame;
      fn_frame.init(fn_detail);
      fn_frame.add_dirty_regs(result);
      fn_frame.finalize();

      assembler.emit_prolog(fn_frame);
      assembler.mov(result, 0);
      assembler.emit_epilog(fn_frame);
    }

    return;
  }
}

// Emits a function consisting of a chain of dependent ALU operations.
//
// Each loop iteration emits 4 instructions, so approximately `ops` instructions
// are generated in total. The Compiler path uses virtual registers wired to the
// function arguments; the Builder and Assembler paths use physical registers
// and optionally wrap the body in a prolog/epilog when `emit_prolog_epilog`
// is true.
static void generate_n_ops_sequence(BaseEmitter& emitter, uint32_t ops, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& compiler = *emitter.as<Compiler>();

    Gp reg_a = compiler.new_gp32("ra");
    Gp reg_b = compiler.new_gp32("rb");
    Gp reg_c = compiler.new_gp32("rc");
    Gp reg_d = compiler.new_gp32("rd");

    FuncNode* fn = compiler.add_func(FuncSignature::build<uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>());
    fn->set_arg(0, reg_a);
    fn->set_arg(1, reg_b);
    fn->set_arg(2, reg_c);
    fn->set_arg(3, reg_d);

    for (uint32_t n = 0; n < ops; n += 4) {
      compiler.add(reg_a, reg_b);
      compiler.imul(reg_a, reg_c);
      compiler.sub(reg_a, reg_d);
      compiler.imul(reg_a, reg_c);
    }

    compiler.ret(reg_a);
    compiler.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& builder = *emitter.as<Builder>();

    Gp reg_a = eax;
    Gp reg_b = ebx;
    Gp reg_c = ecx;
    Gp reg_d = edx;

    // Emits the whole dependent ALU chain (4 instructions per iteration).
    auto emit_body = [&]() {
      for (uint32_t n = 0; n < ops; n += 4) {
        builder.add(reg_a, reg_b);
        builder.imul(reg_a, reg_c);
        builder.sub(reg_a, reg_d);
        builder.imul(reg_a, reg_c);
      }
    };

    if (!emit_prolog_epilog) {
      emit_body();
      builder.ret();
    }
    else {
      FuncDetail fn_detail;
      fn_detail.init(FuncSignature::build<uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>(), builder.environment());

      FuncFrame fn_frame;
      fn_frame.init(fn_detail);
      fn_frame.add_dirty_regs(reg_a, reg_b, reg_c, reg_d);
      fn_frame.finalize();

      builder.emit_prolog(fn_frame);
      emit_body();
      builder.emit_epilog(fn_frame);
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& assembler = *emitter.as<Assembler>();

    Gp reg_a = eax;
    Gp reg_b = ebx;
    Gp reg_c = ecx;
    Gp reg_d = edx;

    // Emits the whole dependent ALU chain (4 instructions per iteration).
    auto emit_body = [&]() {
      for (uint32_t n = 0; n < ops; n += 4) {
        assembler.add(reg_a, reg_b);
        assembler.imul(reg_a, reg_c);
        assembler.sub(reg_a, reg_d);
        assembler.imul(reg_a, reg_c);
      }
    };

    if (!emit_prolog_epilog) {
      emit_body();
      assembler.ret();
    }
    else {
      FuncDetail fn_detail;
      fn_detail.init(FuncSignature::build<uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>(), assembler.environment());

      FuncFrame fn_frame;
      fn_frame.init(fn_detail);
      fn_frame.add_dirty_regs(reg_a, reg_b, reg_c, reg_d);
      fn_frame.finalize();

      assembler.emit_prolog(fn_frame);
      emit_body();
      assembler.emit_epilog(fn_frame);
    }

    return;
  }
}

// Emits the long GP instruction sequence through the given emitter.
//
// Dispatches on the concrete emitter type; the Builder and Assembler paths can
// additionally wrap the sequence in a prolog/epilog when `emit_prolog_epilog`
// is true.
static void generate_gp_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& compiler = *emitter.as<Compiler>();

    Gp ra = compiler.new_gp_ptr("a");
    Gp rb = compiler.new_gp_ptr("b");
    Gp rc = compiler.new_gp_ptr("c");
    Gp rd = compiler.new_gp_ptr("d");

    compiler.add_func(FuncSignature::build<void>());
    generate_gp_sequence_internal(compiler, form, ra, rb, rc, rd);
    compiler.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& builder = *emitter.as<Builder>();

    x86::Gp ra = builder.zax();
    x86::Gp rb = builder.zbx();
    x86::Gp rc = builder.zcx();
    x86::Gp rd = builder.zdx();

    if (!emit_prolog_epilog) {
      generate_gp_sequence_internal(builder, form, ra, rb, rc, rd);
    }
    else {
      FuncDetail fn_detail;
      fn_detail.init(FuncSignature::build<void, void*, const void*, size_t>(), builder.environment());

      FuncFrame fn_frame;
      fn_frame.init(fn_detail);
      fn_frame.add_dirty_regs(ra, rb, rc, rd);
      fn_frame.finalize();

      builder.emit_prolog(fn_frame);
      generate_gp_sequence_internal(builder, form, ra, rb, rc, rd);
      builder.emit_epilog(fn_frame);
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& assembler = *emitter.as<Assembler>();

    x86::Gp ra = assembler.zax();
    x86::Gp rb = assembler.zbx();
    x86::Gp rc = assembler.zcx();
    x86::Gp rd = assembler.zdx();

    if (!emit_prolog_epilog) {
      generate_gp_sequence_internal(assembler, form, ra, rb, rc, rd);
    }
    else {
      FuncDetail fn_detail;
      fn_detail.init(FuncSignature::build<void, void*, const void*, size_t>(), assembler.environment());

      FuncFrame fn_frame;
      fn_frame.init(fn_detail);
      fn_frame.add_dirty_regs(ra, rb, rc, rd);
      fn_frame.finalize();

      assembler.emit_prolog(fn_frame);
      generate_gp_sequence_internal(assembler, form, ra, rb, rc, rd);
      assembler.emit_epilog(fn_frame);
    }

    return;
  }
}

// Generates a long sequence of SSE/SSE2/SSE3/SSSE3/SSE4.x instructions.
//
// The `form` argument selects between register operands (InstForm::kReg) and
// memory operands (InstForm::kMem). As with the GP sequence, the emitted code
// is a benchmark workload for the emitter and is not intended to be executed.
template<typename Emitter>
static void generate_sse_sequenceInternal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::Vec& xmm_a, const x86::Vec& xmm_b, const x86::Vec& xmm_c, const x86::Vec& xmm_d) {

  // 32-bit, 64-bit, and native-width views of `gp`.
  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  // Zero the GP register and all four vector registers first.
  cc.xor_(gpd, gpd);
  cc.xorps(xmm_a, xmm_a);
  cc.xorps(xmm_b, xmm_b);
  cc.xorps(xmm_c, xmm_c);
  cc.xorps(xmm_d, xmm_d);

  if (form == InstForm::kReg) {
    // SSE.
    cc.addps(xmm_a, xmm_b);
    cc.addss(xmm_a, xmm_b);
    cc.andnps(xmm_a, xmm_b);
    cc.andps(xmm_a, xmm_b);
    cc.cmpps(xmm_a, xmm_b, 0);
    cc.cmpss(xmm_a, xmm_b, 0);
    cc.comiss(xmm_a, xmm_b);
    cc.cvtsi2ss(xmm_a, gpd);
    cc.cvtsi2ss(xmm_a, gpz);
    cc.cvtss2si(gpd, xmm_b);
    cc.cvtss2si(gpz, xmm_b);
    cc.cvttss2si(gpd, xmm_b);
    cc.cvttss2si(gpz, xmm_b);
    cc.divps(xmm_a, xmm_b);
    cc.divss(xmm_a, xmm_b);
    cc.maxps(xmm_a, xmm_b);
    cc.maxss(xmm_a, xmm_b);
    cc.minps(xmm_a, xmm_b);
    cc.minss(xmm_a, xmm_b);
    cc.movaps(xmm_a, xmm_b);
    cc.movd(gpd, xmm_b);
    cc.movd(xmm_a, gpd);
    cc.movq(xmm_a, xmm_b);
    cc.movhlps(xmm_a, xmm_b);
    cc.movlhps(xmm_a, xmm_b);
    cc.movups(xmm_a, xmm_b);
    cc.mulps(xmm_a, xmm_b);
    cc.mulss(xmm_a, xmm_b);
    cc.orps(xmm_a, xmm_b);
    cc.rcpps(xmm_a, xmm_b);
    cc.rcpss(xmm_a, xmm_b);
    cc.psadbw(xmm_a, xmm_b);
    cc.rsqrtps(xmm_a, xmm_b);
    cc.rsqrtss(xmm_a, xmm_b);
    cc.sfence();
    cc.shufps(xmm_a, xmm_b, 0);
    cc.sqrtps(xmm_a, xmm_b);
    cc.sqrtss(xmm_a, xmm_b);
    cc.subps(xmm_a, xmm_b);
    cc.subss(xmm_a, xmm_b);
    cc.ucomiss(xmm_a, xmm_b);
    cc.unpckhps(xmm_a, xmm_b);
    cc.unpcklps(xmm_a, xmm_b);
    cc.xorps(xmm_a, xmm_b);

    // SSE2.
    cc.addpd(xmm_a, xmm_b);
    cc.addsd(xmm_a, xmm_b);
    cc.andnpd(xmm_a, xmm_b);
    cc.andpd(xmm_a, xmm_b);
    cc.cmppd(xmm_a, xmm_b, 0);
    cc.cmpsd(xmm_a, xmm_b, 0);
    cc.comisd(xmm_a, xmm_b);
    cc.cvtdq2pd(xmm_a, xmm_b);
    cc.cvtdq2ps(xmm_a, xmm_b);
    cc.cvtpd2dq(xmm_a, xmm_b);
    cc.cvtpd2ps(xmm_a, xmm_b);
    cc.cvtps2dq(xmm_a, xmm_b);
    cc.cvtps2pd(xmm_a, xmm_b);
    cc.cvtsd2si(gpd, xmm_b);
    cc.cvtsd2si(gpz, xmm_b);
    cc.cvtsd2ss(xmm_a, xmm_b);
    cc.cvtsi2sd(xmm_a, gpd);
    cc.cvtsi2sd(xmm_a, gpz);
    cc.cvtss2sd(xmm_a, xmm_b);
    cc.cvtss2si(gpd, xmm_b);
    cc.cvtss2si(gpz, xmm_b);
    cc.cvttpd2dq(xmm_a, xmm_b);
    cc.cvttps2dq(xmm_a, xmm_b);
    cc.cvttsd2si(gpd, xmm_b);
    cc.cvttsd2si(gpz, xmm_b);
    cc.divpd(xmm_a, xmm_b);
    cc.divsd(xmm_a, xmm_b);
    cc.maxpd(xmm_a, xmm_b);
    cc.maxsd(xmm_a, xmm_b);
    cc.minpd(xmm_a, xmm_b);
    cc.minsd(xmm_a, xmm_b);
    cc.movdqa(xmm_a, xmm_b);
    cc.movdqu(xmm_a, xmm_b);
    cc.movmskps(gpd, xmm_b);
    cc.movmskpd(gpd, xmm_b);
    cc.movsd(xmm_a, xmm_b);
    cc.mulpd(xmm_a, xmm_b);
    cc.mulsd(xmm_a, xmm_b);
    cc.orpd(xmm_a, xmm_b);
    cc.packsswb(xmm_a, xmm_b);
    cc.packssdw(xmm_a, xmm_b);
    cc.packuswb(xmm_a, xmm_b);
    cc.paddb(xmm_a, xmm_b);
    cc.paddw(xmm_a, xmm_b);
    cc.paddd(xmm_a, xmm_b);
    cc.paddq(xmm_a, xmm_b);
    cc.paddsb(xmm_a, xmm_b);
    cc.paddsw(xmm_a, xmm_b);
    cc.paddusb(xmm_a, xmm_b);
    cc.paddusw(xmm_a, xmm_b);
    cc.pand(xmm_a, xmm_b);
    cc.pandn(xmm_a, xmm_b);
    cc.pavgb(xmm_a, xmm_b);
    cc.pavgw(xmm_a, xmm_b);
    cc.pcmpeqb(xmm_a, xmm_b);
    cc.pcmpeqw(xmm_a, xmm_b);
    cc.pcmpeqd(xmm_a, xmm_b);
    cc.pcmpgtb(xmm_a, xmm_b);
    cc.pcmpgtw(xmm_a, xmm_b);
    cc.pcmpgtd(xmm_a, xmm_b);
    cc.pmaxsw(xmm_a, xmm_b);
    cc.pmaxub(xmm_a, xmm_b);
    cc.pminsw(xmm_a, xmm_b);
    cc.pminub(xmm_a, xmm_b);
    cc.pmovmskb(gpd, xmm_b);
    cc.pmulhw(xmm_a, xmm_b);
    cc.pmulhuw(xmm_a, xmm_b);
    cc.pmullw(xmm_a, xmm_b);
    cc.pmuludq(xmm_a, xmm_b);
    cc.por(xmm_a, xmm_b);
    // Shifts are exercised in both forms: count in XMM and count as immediate.
    cc.pslld(xmm_a, xmm_b);
    cc.pslld(xmm_a, 0);
    cc.psllq(xmm_a, xmm_b);
    cc.psllq(xmm_a, 0);
    cc.psllw(xmm_a, xmm_b);
    cc.psllw(xmm_a, 0);
    cc.pslldq(xmm_a, 0);
    cc.psrad(xmm_a, xmm_b);
    cc.psrad(xmm_a, 0);
    cc.psraw(xmm_a, xmm_b);
    cc.psraw(xmm_a, 0);
    cc.psubb(xmm_a, xmm_b);
    cc.psubw(xmm_a, xmm_b);
    cc.psubd(xmm_a, xmm_b);
    cc.psubq(xmm_a, xmm_b);
    cc.pmaddwd(xmm_a, xmm_b);
    cc.pshufd(xmm_a, xmm_b, 0);
    cc.pshufhw(xmm_a, xmm_b, 0);
    cc.pshuflw(xmm_a, xmm_b, 0);
    cc.psrld(xmm_a, xmm_b);
    cc.psrld(xmm_a, 0);
    cc.psrlq(xmm_a, xmm_b);
    cc.psrlq(xmm_a, 0);
    cc.psrldq(xmm_a, 0);
    cc.psrlw(xmm_a, xmm_b);
    cc.psrlw(xmm_a, 0);
    cc.psubsb(xmm_a, xmm_b);
    cc.psubsw(xmm_a, xmm_b);
    cc.psubusb(xmm_a, xmm_b);
    cc.psubusw(xmm_a, xmm_b);
    cc.punpckhbw(xmm_a, xmm_b);
    cc.punpckhwd(xmm_a, xmm_b);
    cc.punpckhdq(xmm_a, xmm_b);
    cc.punpckhqdq(xmm_a, xmm_b);
    cc.punpcklbw(xmm_a, xmm_b);
    cc.punpcklwd(xmm_a, xmm_b);
    cc.punpckldq(xmm_a, xmm_b);
    cc.punpcklqdq(xmm_a, xmm_b);
    cc.pxor(xmm_a, xmm_b);
    cc.sqrtpd(xmm_a, xmm_b);
    cc.sqrtsd(xmm_a, xmm_b);
    cc.subpd(xmm_a, xmm_b);
    cc.subsd(xmm_a, xmm_b);
    cc.ucomisd(xmm_a, xmm_b);
    cc.unpckhpd(xmm_a, xmm_b);
    cc.unpcklpd(xmm_a, xmm_b);
    cc.xorpd(xmm_a, xmm_b);

    // SSE3.
    cc.addsubpd(xmm_a, xmm_b);
    cc.addsubps(xmm_a, xmm_b);
    cc.haddpd(xmm_a, xmm_b);
    cc.haddps(xmm_a, xmm_b);
    cc.hsubpd(xmm_a, xmm_b);
    cc.hsubps(xmm_a, xmm_b);
    cc.movddup(xmm_a, xmm_b);
    cc.movshdup(xmm_a, xmm_b);
    cc.movsldup(xmm_a, xmm_b);

    // SSSE3.
    cc.psignb(xmm_a, xmm_b);
    cc.psignw(xmm_a, xmm_b);
    cc.psignd(xmm_a, xmm_b);
    cc.phaddw(xmm_a, xmm_b);
    cc.phaddd(xmm_a, xmm_b);
    cc.phaddsw(xmm_a, xmm_b);
    cc.phsubw(xmm_a, xmm_b);
    cc.phsubd(xmm_a, xmm_b);
    cc.phsubsw(xmm_a, xmm_b);
    cc.pmaddubsw(xmm_a, xmm_b);
    cc.pabsb(xmm_a, xmm_b);
    cc.pabsw(xmm_a, xmm_b);
    cc.pabsd(xmm_a, xmm_b);
    cc.pmulhrsw(xmm_a, xmm_b);
    cc.pshufb(xmm_a, xmm_b);
    cc.palignr(xmm_a, xmm_b, 0);

    // SSE4.1.
    cc.blendpd(xmm_a, xmm_b, 0);
    cc.blendps(xmm_a, xmm_b, 0);
    cc.blendvpd(xmm_a, xmm_b, xmm_a);
    cc.blendvps(xmm_a, xmm_b, xmm_a);

    cc.dppd(xmm_a, xmm_b, 0);
    cc.dpps(xmm_a, xmm_b, 0);
    cc.extractps(gpd, xmm_b, 0);
    cc.insertps(xmm_a, xmm_b, 0);
    cc.mpsadbw(xmm_a, xmm_b, 0);
    cc.packusdw(xmm_a, xmm_b);
    cc.pblendvb(xmm_a, xmm_b, xmm_a);
    cc.pblendw(xmm_a, xmm_b, 0);
    cc.pcmpeqq(xmm_a, xmm_b);
    cc.pextrb(gpd, xmm_b, 0);
    cc.pextrd(gpd, xmm_b, 0);
    // pextrq needs a 64-bit destination register, so it's only emitted in 64-bit mode.
    if (cc.is_64bit()) cc.pextrq(gpq, xmm_b, 0);
    cc.pextrw(gpd, xmm_b, 0);
    cc.phminposuw(xmm_a, xmm_b);
    cc.pinsrb(xmm_a, gpd, 0);
    cc.pinsrd(xmm_a, gpd, 0);
    cc.pinsrw(xmm_a, gpd, 0);
    cc.pmaxuw(xmm_a, xmm_b);
    cc.pmaxsb(xmm_a, xmm_b);
    cc.pmaxsd(xmm_a, xmm_b);
    cc.pmaxud(xmm_a, xmm_b);
    cc.pminsb(xmm_a, xmm_b);
    cc.pminuw(xmm_a, xmm_b);
    cc.pminud(xmm_a, xmm_b);
    cc.pminsd(xmm_a, xmm_b);
    cc.pmovsxbw(xmm_a, xmm_b);
    cc.pmovsxbd(xmm_a, xmm_b);
    cc.pmovsxbq(xmm_a, xmm_b);
    cc.pmovsxwd(xmm_a, xmm_b);
    cc.pmovsxwq(xmm_a, xmm_b);
    cc.pmovsxdq(xmm_a, xmm_b);
    cc.pmovzxbw(xmm_a, xmm_b);
    cc.pmovzxbd(xmm_a, xmm_b);
    cc.pmovzxbq(xmm_a, xmm_b);
    cc.pmovzxwd(xmm_a, xmm_b);
    cc.pmovzxwq(xmm_a, xmm_b);
    cc.pmovzxdq(xmm_a, xmm_b);
    cc.pmuldq(xmm_a, xmm_b);
    cc.pmulld(xmm_a, xmm_b);
    cc.ptest(xmm_a, xmm_b);
    cc.roundps(xmm_a, xmm_b, 0);
    cc.roundss(xmm_a, xmm_b, 0);
    cc.roundpd(xmm_a, xmm_b, 0);
    cc.roundsd(xmm_a, xmm_b, 0);
  }
  else {
    // Memory operand addressed by the native-width register.
    x86::Mem m = x86::ptr(gpz);

    // SSE.
    cc.addps(xmm_a, m);
    cc.addss(xmm_a, m);
    cc.andnps(xmm_a, m);
    cc.andps(xmm_a, m);
    cc.cmpps(xmm_a, m, 0);
    cc.cmpss(xmm_a, m, 0);
    cc.comiss(xmm_a, m);
    cc.cvtpi2ps(xmm_a, m);
    cc.cvtsi2ss(xmm_a, m);
    cc.cvtss2si(gpd, m);
    cc.cvtss2si(gpz, m);
    cc.cvttss2si(gpd, m);
    cc.cvttss2si(gpz, m);
    cc.divps(xmm_a, m);
    cc.divss(xmm_a, m);
    cc.maxps(xmm_a, m);
    cc.maxss(xmm_a, m);
    cc.minps(xmm_a, m);
    cc.minss(xmm_a, m);
    cc.movaps(xmm_a, m);
    cc.movaps(m, xmm_b);
    cc.movd(m, xmm_b);
    cc.movd(xmm_a, m);
    cc.movq(m, xmm_b);
    cc.movq(xmm_a, m);
    cc.movhps(xmm_a, m);
    cc.movhps(m, xmm_b);
    cc.movlps(xmm_a, m);
    cc.movlps(m, xmm_b);
    cc.movntps(m, xmm_b);
    cc.movss(xmm_a, m);
    cc.movss(m, xmm_b);
    cc.movups(xmm_a, m);
    cc.movups(m, xmm_b);
    cc.mulps(xmm_a, m);
    cc.mulss(xmm_a, m);
    cc.orps(xmm_a, m);
    cc.rcpps(xmm_a, m);
    cc.rcpss(xmm_a, m);
    cc.psadbw(xmm_a, m);
    cc.rsqrtps(xmm_a, m);
    cc.rsqrtss(xmm_a, m);
    cc.shufps(xmm_a, m, 0);
    cc.sqrtps(xmm_a, m);
    cc.sqrtss(xmm_a, m);
    cc.stmxcsr(m);
    cc.subps(xmm_a, m);
    cc.subss(xmm_a, m);
    cc.ucomiss(xmm_a, m);
    cc.unpckhps(xmm_a, m);
    cc.unpcklps(xmm_a, m);
    cc.xorps(xmm_a, m);

    // SSE2.
    cc.addpd(xmm_a, m);
    cc.addsd(xmm_a, m);
    cc.andnpd(xmm_a, m);
    cc.andpd(xmm_a, m);
    cc.cmppd(xmm_a, m, 0);
    cc.cmpsd(xmm_a, m, 0);
    cc.comisd(xmm_a, m);
    cc.cvtdq2pd(xmm_a, m);
    cc.cvtdq2ps(xmm_a, m);
    cc.cvtpd2dq(xmm_a, m);
    cc.cvtpd2ps(xmm_a, m);
    cc.cvtpi2pd(xmm_a, m);
    cc.cvtps2dq(xmm_a, m);
    cc.cvtps2pd(xmm_a, m);
    cc.cvtsd2si(gpd, m);
    cc.cvtsd2si(gpz, m);
    cc.cvtsd2ss(xmm_a, m);
    cc.cvtsi2sd(xmm_a, m);
    cc.cvtss2sd(xmm_a, m);
    cc.cvtss2si(gpd, m);
    cc.cvtss2si(gpz, m);
    cc.cvttpd2dq(xmm_a, m);
    cc.cvttps2dq(xmm_a, m);
    cc.cvttsd2si(gpd, m);
    cc.cvttsd2si(gpz, m);
    cc.divpd(xmm_a, m);
    cc.divsd(xmm_a, m);
    cc.maxpd(xmm_a, m);
    cc.maxsd(xmm_a, m);
    cc.minpd(xmm_a, m);
    cc.minsd(xmm_a, m);
    cc.movdqa(xmm_a, m);
    cc.movdqa(m, xmm_b);
    cc.movdqu(xmm_a, m);
    cc.movdqu(m, xmm_b);
    cc.movsd(xmm_a, m);
    cc.movsd(m, xmm_b);
    cc.movapd(xmm_a, m);
    cc.movapd(m, xmm_b);
    cc.movhpd(xmm_a, m);
    cc.movhpd(m, xmm_b);
    cc.movlpd(xmm_a, m);
    cc.movlpd(m, xmm_b);
    cc.movntdq(m, xmm_b);
    cc.movntpd(m, xmm_b);
    cc.movupd(xmm_a, m);
    cc.movupd(m, xmm_b);
    cc.mulpd(xmm_a, m);
    cc.mulsd(xmm_a, m);
    cc.orpd(xmm_a, m);
    cc.packsswb(xmm_a, m);
    cc.packssdw(xmm_a, m);
    cc.packuswb(xmm_a, m);
    cc.paddb(xmm_a, m);
    cc.paddw(xmm_a, m);
    cc.paddd(xmm_a, m);
    cc.paddq(xmm_a, m);
    cc.paddsb(xmm_a, m);
    cc.paddsw(xmm_a, m);
    cc.paddusb(xmm_a, m);
    cc.paddusw(xmm_a, m);
    cc.pand(xmm_a, m);
    cc.pandn(xmm_a, m);
    cc.pavgb(xmm_a, m);
    cc.pavgw(xmm_a, m);
    cc.pcmpeqb(xmm_a, m);
    cc.pcmpeqw(xmm_a, m);
    cc.pcmpeqd(xmm_a, m);
    cc.pcmpgtb(xmm_a, m);
    cc.pcmpgtw(xmm_a, m);
    cc.pcmpgtd(xmm_a, m);
    cc.pmaxsw(xmm_a, m);
    cc.pmaxub(xmm_a, m);
    cc.pminsw(xmm_a, m);
    cc.pminub(xmm_a, m);
    cc.pmulhw(xmm_a, m);
    cc.pmulhuw(xmm_a, m);
    cc.pmullw(xmm_a, m);
    cc.pmuludq(xmm_a, m);
    cc.por(xmm_a, m);
    cc.pslld(xmm_a, m);
    cc.psllq(xmm_a, m);
    cc.psllw(xmm_a, m);
    cc.psrad(xmm_a, m);
    cc.psraw(xmm_a, m);
    cc.psubb(xmm_a, m);
    cc.psubw(xmm_a, m);
    cc.psubd(xmm_a, m);
    cc.psubq(xmm_a, m);
    cc.pmaddwd(xmm_a, m);
    cc.pshufd(xmm_a, m, 0);
    cc.pshufhw(xmm_a, m, 0);
    cc.pshuflw(xmm_a, m, 0);
    cc.psrld(xmm_a, m);
    cc.psrlq(xmm_a, m);
    cc.psrlw(xmm_a, m);
    cc.psubsb(xmm_a, m);
    cc.psubsw(xmm_a, m);
    cc.psubusb(xmm_a, m);
    cc.psubusw(xmm_a, m);
    cc.punpckhbw(xmm_a, m);
    cc.punpckhwd(xmm_a, m);
    cc.punpckhdq(xmm_a, m);
    cc.punpckhqdq(xmm_a, m);
    cc.punpcklbw(xmm_a, m);
    cc.punpcklwd(xmm_a, m);
    cc.punpckldq(xmm_a, m);
    cc.punpcklqdq(xmm_a, m);
    cc.pxor(xmm_a, m);
    cc.sqrtpd(xmm_a, m);
    cc.sqrtsd(xmm_a, m);
    cc.subpd(xmm_a, m);
    cc.subsd(xmm_a, m);
    cc.ucomisd(xmm_a, m);
    cc.unpckhpd(xmm_a, m);
    cc.unpcklpd(xmm_a, m);
    cc.xorpd(xmm_a, m);

    // SSE3.
    cc.addsubpd(xmm_a, m);
    cc.addsubps(xmm_a, m);
    cc.haddpd(xmm_a, m);
    cc.haddps(xmm_a, m);
    cc.hsubpd(xmm_a, m);
    cc.hsubps(xmm_a, m);
    cc.lddqu(xmm_a, m);
    cc.movddup(xmm_a, m);
    cc.movshdup(xmm_a, m);
    cc.movsldup(xmm_a, m);

    // SSSE3.
    cc.psignb(xmm_a, m);
    cc.psignw(xmm_a, m);
    cc.psignd(xmm_a, m);
    cc.phaddw(xmm_a, m);
    cc.phaddd(xmm_a, m);
    cc.phaddsw(xmm_a, m);
    cc.phsubw(xmm_a, m);
    cc.phsubd(xmm_a, m);
    cc.phsubsw(xmm_a, m);
    cc.pmaddubsw(xmm_a, m);
    cc.pabsb(xmm_a, m);
    cc.pabsw(xmm_a, m);
    cc.pabsd(xmm_a, m);
    cc.pmulhrsw(xmm_a, m);
    cc.pshufb(xmm_a, m);
    cc.palignr(xmm_a, m, 0);

    // SSE4.1.
    cc.blendpd(xmm_a, m, 0);
    cc.blendps(xmm_a, m, 0);
    cc.blendvpd(xmm_a, m, xmm_a);
    cc.blendvps(xmm_a, m, xmm_a);

    cc.dppd(xmm_a, m, 0);
    cc.dpps(xmm_a, m, 0);
    cc.extractps(m, xmm_b, 0);
    cc.insertps(xmm_a, m, 0);
    cc.movntdqa(xmm_a, m);
    cc.mpsadbw(xmm_a, m, 0);
    cc.packusdw(xmm_a, m);
    cc.pblendvb(xmm_a, m, xmm_a);
    cc.pblendw(xmm_a, m, 0);
    cc.pcmpeqq(xmm_a, m);
    cc.pextrb(m, xmm_b, 0);
    cc.pextrd(m, xmm_b, 0);
    // pextrq is a 64-bit extraction, so it's only emitted in 64-bit mode.
    if (cc.is_64bit()) cc.pextrq(m, xmm_b, 0);
    cc.pextrw(m, xmm_b, 0);
    cc.phminposuw(xmm_a, m);
    cc.pinsrb(xmm_a, m, 0);
    cc.pinsrd(xmm_a, m, 0);
    cc.pinsrw(xmm_a, m, 0);
    cc.pmaxuw(xmm_a, m);
    cc.pmaxsb(xmm_a, m);
    cc.pmaxsd(xmm_a, m);
    cc.pmaxud(xmm_a, m);
    cc.pminsb(xmm_a, m);
    cc.pminuw(xmm_a, m);
    cc.pminud(xmm_a, m);
    cc.pminsd(xmm_a, m);
    cc.pmovsxbw(xmm_a, m);
    cc.pmovsxbd(xmm_a, m);
    cc.pmovsxbq(xmm_a, m);
    cc.pmovsxwd(xmm_a, m);
    cc.pmovsxwq(xmm_a, m);
    cc.pmovsxdq(xmm_a, m);
    cc.pmovzxbw(xmm_a, m);
    cc.pmovzxbd(xmm_a, m);
    cc.pmovzxbq(xmm_a, m);
    cc.pmovzxwd(xmm_a, m);
    cc.pmovzxwq(xmm_a, m);
    cc.pmovzxdq(xmm_a, m);
    cc.pmuldq(xmm_a, m);
    cc.pmulld(xmm_a, m);
    cc.ptest(xmm_a, m);
    cc.roundps(xmm_a, m, 0);
    cc.roundss(xmm_a, m, 0);
    cc.roundpd(xmm_a, m, 0);
    cc.roundsd(xmm_a, m, 0);

    // SSE4.2.
    cc.pcmpgtq(xmm_a, m);
  }
}

// Emits the long SSE instruction sequence through the given emitter.
//
// The Compiler path uses virtual registers; the Builder and Assembler paths
// use physical registers (eax, xmm0..xmm3) and can wrap the sequence in a
// prolog/epilog when `emit_prolog_epilog` is true.
static void generate_sse_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& compiler = *emitter.as<Compiler>();

    Gp tmp = compiler.new_gpz("gp");
    Vec v0 = compiler.new_xmm("a");
    Vec v1 = compiler.new_xmm("b");
    Vec v2 = compiler.new_xmm("c");
    Vec v3 = compiler.new_xmm("d");

    compiler.add_func(FuncSignature::build<void>());
    generate_sse_sequenceInternal(compiler, form, tmp, v0, v1, v2, v3);
    compiler.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& builder = *emitter.as<Builder>();

    if (!emit_prolog_epilog) {
      generate_sse_sequenceInternal(builder, form, eax, xmm0, xmm1, xmm2, xmm3);
    }
    else {
      FuncDetail fn_detail;
      fn_detail.init(FuncSignature::build<void, void*, const void*, size_t>(), builder.environment());

      FuncFrame fn_frame;
      fn_frame.init(fn_detail);
      fn_frame.add_dirty_regs(eax, xmm0, xmm1, xmm2, xmm3);
      fn_frame.finalize();

      builder.emit_prolog(fn_frame);
      generate_sse_sequenceInternal(builder, form, eax, xmm0, xmm1, xmm2, xmm3);
      builder.emit_epilog(fn_frame);
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& assembler = *emitter.as<Assembler>();

    if (!emit_prolog_epilog) {
      generate_sse_sequenceInternal(assembler, form, eax, xmm0, xmm1, xmm2, xmm3);
    }
    else {
      FuncDetail fn_detail;
      fn_detail.init(FuncSignature::build<void, void*, const void*, size_t>(), assembler.environment());

      FuncFrame fn_frame;
      fn_frame.init(fn_detail);
      fn_frame.add_dirty_regs(eax, xmm0, xmm1, xmm2, xmm3);
      fn_frame.finalize();

      assembler.emit_prolog(fn_frame);
      generate_sse_sequenceInternal(assembler, form, eax, xmm0, xmm1, xmm2, xmm3);
      assembler.emit_epilog(fn_frame);
    }

    return;
  }
}

// Generates a long sequence of AVX instructions (register-operand forms only;
// the memory-operand counterpart is generated by a separate function).
//
// The sequence does not compute anything meaningful - it emits a fixed stream
// of AVX/AVX2/FMA instructions so that the emitter's encoding path can be
// exercised. The instruction order is significant for comparing output, so it
// must not be changed.
template<typename Emitter>
static void generate_avx_sequenceInternalRegOnly(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  // 32-bit, 64-bit, and native-width views of the same GP register.
  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  // 128-bit (XMM) views of the vector registers.
  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  // 256-bit (YMM) views (vec_d is only used in its XMM form here).
  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_c = vec_c.ymm();

  // Zero the GP register and all vector inputs first so the emitted sequence
  // doesn't depend on previous register state.
  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

  // AVX (VEX-encoded forms of SSE..SSE4.2 instructions, XMM and YMM widths).
  cc.vaddpd(xmm_a, xmm_b, xmm_c);
  cc.vaddpd(ymm_a, ymm_b, ymm_c);
  cc.vaddps(xmm_a, xmm_b, xmm_c);
  cc.vaddps(ymm_a, ymm_b, ymm_c);
  cc.vaddsd(xmm_a, xmm_b, xmm_c);
  cc.vaddss(xmm_a, xmm_b, xmm_c);
  cc.vaddsubpd(xmm_a, xmm_b, xmm_c);
  cc.vaddsubpd(ymm_a, ymm_b, ymm_c);
  cc.vaddsubps(xmm_a, xmm_b, xmm_c);
  cc.vaddsubps(ymm_a, ymm_b, ymm_c);
  cc.vandpd(xmm_a, xmm_b, xmm_c);
  cc.vandpd(ymm_a, ymm_b, ymm_c);
  cc.vandps(xmm_a, xmm_b, xmm_c);
  cc.vandps(ymm_a, ymm_b, ymm_c);
  cc.vandnpd(xmm_a, xmm_b, xmm_c);
  cc.vandnpd(ymm_a, ymm_b, ymm_c);
  cc.vandnps(xmm_a, xmm_b, xmm_c);
  cc.vandnps(ymm_a, ymm_b, ymm_c);
  cc.vblendpd(xmm_a, xmm_b, xmm_c, 0);
  cc.vblendpd(ymm_a, ymm_b, ymm_c, 0);
  cc.vblendps(xmm_a, xmm_b, xmm_c, 0);
  cc.vblendps(ymm_a, ymm_b, ymm_c, 0);
  cc.vblendvpd(xmm_a, xmm_b, xmm_c, xmm_a);
  cc.vblendvpd(ymm_a, ymm_b, ymm_c, ymm_a);
  cc.vcmppd(xmm_a, xmm_b, xmm_c, 0);
  cc.vcmppd(ymm_a, ymm_b, ymm_c, 0);
  cc.vcmpps(xmm_a, xmm_b, xmm_c, 0);
  cc.vcmpps(ymm_a, ymm_b, ymm_c, 0);
  cc.vcmpsd(xmm_a, xmm_b, xmm_c, 0);
  cc.vcmpss(xmm_a, xmm_b, xmm_c, 0);
  cc.vcomisd(xmm_a, xmm_b);
  cc.vcomiss(xmm_a, xmm_b);
  cc.vcvtdq2pd(xmm_a, xmm_b);
  cc.vcvtdq2pd(ymm_a, xmm_b);
  cc.vcvtdq2ps(xmm_a, xmm_b);
  cc.vcvtdq2ps(ymm_a, ymm_b);
  cc.vcvtpd2dq(xmm_a, xmm_b);
  cc.vcvtpd2dq(xmm_a, ymm_b);
  cc.vcvtpd2ps(xmm_a, xmm_b);
  cc.vcvtpd2ps(xmm_a, ymm_b);
  cc.vcvtps2dq(xmm_a, xmm_b);
  cc.vcvtps2dq(ymm_a, ymm_b);
  cc.vcvtps2pd(xmm_a, xmm_b);
  cc.vcvtps2pd(ymm_a, xmm_b);
  // Conversions to GP registers are emitted with both 32-bit and
  // native-width destinations.
  cc.vcvtsd2si(gpd, xmm_b);
  cc.vcvtsd2si(gpz, xmm_b);
  cc.vcvtsd2ss(xmm_a, xmm_b, xmm_c);
  cc.vcvtsi2sd(xmm_a, xmm_b, gpd);
  cc.vcvtsi2sd(xmm_a, xmm_b, gpz);
  cc.vcvtsi2ss(xmm_a, xmm_b, gpd);
  cc.vcvtsi2ss(xmm_a, xmm_b, gpz);
  cc.vcvtss2sd(xmm_a, xmm_b, xmm_c);
  cc.vcvtss2si(gpd, xmm_b);
  cc.vcvttpd2dq(xmm_a, xmm_b);
  cc.vcvttpd2dq(xmm_a, ymm_b);
  cc.vcvttps2dq(xmm_a, xmm_b);
  cc.vcvttps2dq(ymm_a, ymm_b);
  cc.vcvttsd2si(gpd, xmm_b);
  cc.vcvttss2si(gpz, xmm_b);
  cc.vdivpd(xmm_a, xmm_b, xmm_c);
  cc.vdivpd(ymm_a, ymm_b, ymm_c);
  cc.vdivps(xmm_a, xmm_b, xmm_c);
  cc.vdivps(ymm_a, ymm_b, ymm_c);
  cc.vdivsd(xmm_a, xmm_b, xmm_c);
  cc.vdivss(xmm_a, xmm_b, xmm_c);
  cc.vdppd(xmm_a, xmm_b, xmm_c, 0);
  cc.vdpps(xmm_a, xmm_b, xmm_c, 0);
  cc.vdpps(ymm_a, ymm_b, ymm_c, 0);
  cc.vextractf128(xmm_a, ymm_b, 0);
  cc.vextractps(gpd, xmm_b, 0);
  cc.vhaddpd(xmm_a, xmm_b, xmm_c);
  cc.vhaddpd(ymm_a, ymm_b, ymm_c);
  cc.vhaddps(xmm_a, xmm_b, xmm_c);
  cc.vhaddps(ymm_a, ymm_b, ymm_c);
  cc.vhsubpd(xmm_a, xmm_b, xmm_c);
  cc.vhsubpd(ymm_a, ymm_b, ymm_c);
  cc.vhsubps(xmm_a, xmm_b, xmm_c);
  cc.vhsubps(ymm_a, ymm_b, ymm_c);
  cc.vinsertf128(ymm_a, ymm_b, xmm_c, 0);
  cc.vinsertps(xmm_a, xmm_b, xmm_c, 0);
  cc.vmaxpd(xmm_a, xmm_b, xmm_c);
  cc.vmaxpd(ymm_a, ymm_b, ymm_c);
  cc.vmaxps(xmm_a, xmm_b, xmm_c);
  cc.vmaxps(ymm_a, ymm_b, ymm_c);
  cc.vmaxsd(xmm_a, xmm_b, xmm_c);
  cc.vmaxss(xmm_a, xmm_b, xmm_c);
  cc.vminpd(xmm_a, xmm_b, xmm_c);
  cc.vminpd(ymm_a, ymm_b, ymm_c);
  cc.vminps(xmm_a, xmm_b, xmm_c);
  cc.vminps(ymm_a, ymm_b, ymm_c);
  cc.vminsd(xmm_a, xmm_b, xmm_c);
  cc.vminss(xmm_a, xmm_b, xmm_c);
  cc.vmovapd(xmm_a, xmm_b);
  cc.vmovapd(ymm_a, ymm_b);
  cc.vmovaps(xmm_a, xmm_b);
  cc.vmovaps(ymm_a, ymm_b);
  cc.vmovd(xmm_a, gpd);
  cc.vmovd(gpd, xmm_b);
  cc.vmovddup(xmm_a, xmm_b);
  cc.vmovddup(ymm_a, ymm_b);
  cc.vmovdqa(xmm_a, xmm_b);
  cc.vmovdqa(ymm_a, ymm_b);
  cc.vmovdqu(xmm_a, xmm_b);
  cc.vmovdqu(ymm_a, ymm_b);
  cc.vmovhlps(xmm_a, xmm_b, xmm_c);
  cc.vmovlhps(xmm_a, xmm_b, xmm_c);
  cc.vmovmskpd(gpd, xmm_b);
  cc.vmovmskpd(gpd, ymm_b);
  cc.vmovmskps(gpd, xmm_b);
  cc.vmovmskps(gpd, ymm_b);
  cc.vmovsd(xmm_a, xmm_b, xmm_c);
  cc.vmovshdup(xmm_a, xmm_b);
  cc.vmovshdup(ymm_a, ymm_b);
  cc.vmovsldup(xmm_a, xmm_b);
  cc.vmovsldup(ymm_a, ymm_b);
  cc.vmovss(xmm_a, xmm_b, xmm_c);
  cc.vmovupd(xmm_a, xmm_b);
  cc.vmovupd(ymm_a, ymm_b);
  cc.vmovups(xmm_a, xmm_b);
  cc.vmovups(ymm_a, ymm_b);
  cc.vmpsadbw(xmm_a, xmm_b, xmm_c, 0);
  cc.vmulpd(xmm_a, xmm_b, xmm_c);
  cc.vmulpd(ymm_a, ymm_b, ymm_c);
  cc.vmulps(xmm_a, xmm_b, xmm_c);
  cc.vmulps(ymm_a, ymm_b, ymm_c);
  cc.vmulsd(xmm_a, xmm_b, xmm_c);
  cc.vmulss(xmm_a, xmm_b, xmm_c);
  cc.vorpd(xmm_a, xmm_b, xmm_c);
  cc.vorpd(ymm_a, ymm_b, ymm_c);
  cc.vorps(xmm_a, xmm_b, xmm_c);
  cc.vorps(ymm_a, ymm_b, ymm_c);
  // Integer SIMD instructions below are limited to XMM here; their YMM forms
  // require AVX2 and are emitted in the AVX2 section.
  cc.vpabsb(xmm_a, xmm_b);
  cc.vpabsd(xmm_a, xmm_b);
  cc.vpabsw(xmm_a, xmm_b);
  cc.vpackssdw(xmm_a, xmm_b, xmm_c);
  cc.vpacksswb(xmm_a, xmm_b, xmm_c);
  cc.vpackusdw(xmm_a, xmm_b, xmm_c);
  cc.vpackuswb(xmm_a, xmm_b, xmm_c);
  cc.vpaddb(xmm_a, xmm_b, xmm_c);
  cc.vpaddd(xmm_a, xmm_b, xmm_c);
  cc.vpaddq(xmm_a, xmm_b, xmm_c);
  cc.vpaddw(xmm_a, xmm_b, xmm_c);
  cc.vpaddsb(xmm_a, xmm_b, xmm_c);
  cc.vpaddsw(xmm_a, xmm_b, xmm_c);
  cc.vpaddusb(xmm_a, xmm_b, xmm_c);
  cc.vpaddusw(xmm_a, xmm_b, xmm_c);
  cc.vpalignr(xmm_a, xmm_b, xmm_c, 0);
  cc.vpand(xmm_a, xmm_b, xmm_c);
  cc.vpandn(xmm_a, xmm_b, xmm_c);
  cc.vpavgb(xmm_a, xmm_b, xmm_c);
  cc.vpavgw(xmm_a, xmm_b, xmm_c);
  cc.vpblendvb(xmm_a, xmm_b, xmm_c, xmm_a);
  cc.vpblendw(xmm_a, xmm_b, xmm_c, 0);
  cc.vpcmpeqb(xmm_a, xmm_b, xmm_c);
  cc.vpcmpeqd(xmm_a, xmm_b, xmm_c);
  cc.vpcmpeqq(xmm_a, xmm_b, xmm_c);
  cc.vpcmpeqw(xmm_a, xmm_b, xmm_c);
  cc.vpcmpgtb(xmm_a, xmm_b, xmm_c);
  cc.vpcmpgtd(xmm_a, xmm_b, xmm_c);
  cc.vpcmpgtq(xmm_a, xmm_b, xmm_c);
  cc.vpcmpgtw(xmm_a, xmm_b, xmm_c);
  cc.vpermilpd(xmm_a, xmm_b, xmm_c);
  cc.vpermilpd(ymm_a, ymm_b, ymm_c);
  cc.vpermilpd(xmm_a, xmm_b, 0);
  cc.vpermilpd(ymm_a, ymm_b, 0);
  cc.vpermilps(xmm_a, xmm_b, xmm_c);
  cc.vpermilps(ymm_a, ymm_b, ymm_c);
  cc.vpermilps(xmm_a, xmm_b, 0);
  cc.vpermilps(ymm_a, ymm_b, 0);
  cc.vperm2f128(ymm_a, ymm_b, ymm_c, 0);
  cc.vpextrb(gpd, xmm_b, 0);
  cc.vpextrd(gpd, xmm_b, 0);
  // VPEXTRQ requires a 64-bit GP destination, so it's only valid in 64-bit mode.
  if (cc.is_64bit()) cc.vpextrq(gpq, xmm_b, 0);
  cc.vpextrw(gpd, xmm_b, 0);
  cc.vphaddd(xmm_a, xmm_b, xmm_c);
  cc.vphaddsw(xmm_a, xmm_b, xmm_c);
  cc.vphaddw(xmm_a, xmm_b, xmm_c);
  cc.vphminposuw(xmm_a, xmm_b);
  cc.vphsubd(xmm_a, xmm_b, xmm_c);
  cc.vphsubsw(xmm_a, xmm_b, xmm_c);
  cc.vphsubw(xmm_a, xmm_b, xmm_c);
  cc.vpinsrb(xmm_a, xmm_b, gpd, 0);
  cc.vpinsrd(xmm_a, xmm_b, gpd, 0);
  cc.vpinsrw(xmm_a, xmm_b, gpd, 0);
  cc.vpmaddubsw(xmm_a, xmm_b, xmm_c);
  cc.vpmaddwd(xmm_a, xmm_b, xmm_c);
  cc.vpmaxsb(xmm_a, xmm_b, xmm_c);
  cc.vpmaxsd(xmm_a, xmm_b, xmm_c);
  cc.vpmaxsw(xmm_a, xmm_b, xmm_c);
  cc.vpmaxub(xmm_a, xmm_b, xmm_c);
  cc.vpmaxud(xmm_a, xmm_b, xmm_c);
  cc.vpmaxuw(xmm_a, xmm_b, xmm_c);
  cc.vpminsb(xmm_a, xmm_b, xmm_c);
  cc.vpminsd(xmm_a, xmm_b, xmm_c);
  cc.vpminsw(xmm_a, xmm_b, xmm_c);
  cc.vpminub(xmm_a, xmm_b, xmm_c);
  cc.vpminud(xmm_a, xmm_b, xmm_c);
  cc.vpminuw(xmm_a, xmm_b, xmm_c);
  cc.vpmovmskb(gpd, xmm_b);
  cc.vpmovsxbd(xmm_a, xmm_b);
  cc.vpmovsxbq(xmm_a, xmm_b);
  cc.vpmovsxbw(xmm_a, xmm_b);
  cc.vpmovsxdq(xmm_a, xmm_b);
  cc.vpmovsxwd(xmm_a, xmm_b);
  cc.vpmovsxwq(xmm_a, xmm_b);
  cc.vpmovzxbd(xmm_a, xmm_b);
  cc.vpmovzxbq(xmm_a, xmm_b);
  cc.vpmovzxbw(xmm_a, xmm_b);
  cc.vpmovzxdq(xmm_a, xmm_b);
  cc.vpmovzxwd(xmm_a, xmm_b);
  cc.vpmovzxwq(xmm_a, xmm_b);
  cc.vpmuldq(xmm_a, xmm_b, xmm_c);
  cc.vpmulhrsw(xmm_a, xmm_b, xmm_c);
  cc.vpmulhuw(xmm_a, xmm_b, xmm_c);
  cc.vpmulhw(xmm_a, xmm_b, xmm_c);
  cc.vpmulld(xmm_a, xmm_b, xmm_c);
  cc.vpmullw(xmm_a, xmm_b, xmm_c);
  cc.vpmuludq(xmm_a, xmm_b, xmm_c);
  cc.vpor(xmm_a, xmm_b, xmm_c);
  cc.vpsadbw(xmm_a, xmm_b, xmm_c);
  cc.vpshufb(xmm_a, xmm_b, xmm_c);
  cc.vpshufd(xmm_a, xmm_b, 0);
  cc.vpshufhw(xmm_a, xmm_b, 0);
  cc.vpshuflw(xmm_a, xmm_b, 0);
  cc.vpsignb(xmm_a, xmm_b, xmm_c);
  cc.vpsignd(xmm_a, xmm_b, xmm_c);
  cc.vpsignw(xmm_a, xmm_b, xmm_c);
  // Shifts are emitted in both by-register (xmm count) and by-immediate forms.
  cc.vpslld(xmm_a, xmm_b, xmm_c);
  cc.vpslld(xmm_a, xmm_b, 0);
  cc.vpslldq(xmm_a, xmm_b, 0);
  cc.vpsllq(xmm_a, xmm_b, xmm_c);
  cc.vpsllq(xmm_a, xmm_b, 0);
  cc.vpsllw(xmm_a, xmm_b, xmm_c);
  cc.vpsllw(xmm_a, xmm_b, 0);
  cc.vpsrad(xmm_a, xmm_b, xmm_c);
  cc.vpsrad(xmm_a, xmm_b, 0);
  cc.vpsraw(xmm_a, xmm_b, xmm_c);
  cc.vpsraw(xmm_a, xmm_b, 0);
  cc.vpsrld(xmm_a, xmm_b, xmm_c);
  cc.vpsrld(xmm_a, xmm_b, 0);
  cc.vpsrldq(xmm_a, xmm_b, 0);
  cc.vpsrlq(xmm_a, xmm_b, xmm_c);
  cc.vpsrlq(xmm_a, xmm_b, 0);
  cc.vpsrlw(xmm_a, xmm_b, xmm_c);
  cc.vpsrlw(xmm_a, xmm_b, 0);
  cc.vpsubb(xmm_a, xmm_b, xmm_c);
  cc.vpsubd(xmm_a, xmm_b, xmm_c);
  cc.vpsubq(xmm_a, xmm_b, xmm_c);
  cc.vpsubw(xmm_a, xmm_b, xmm_c);
  cc.vpsubsb(xmm_a, xmm_b, xmm_c);
  cc.vpsubsw(xmm_a, xmm_b, xmm_c);
  cc.vpsubusb(xmm_a, xmm_b, xmm_c);
  cc.vpsubusw(xmm_a, xmm_b, xmm_c);
  cc.vptest(xmm_a, xmm_b);
  cc.vptest(ymm_a, ymm_b);
  cc.vpunpckhbw(xmm_a, xmm_b, xmm_c);
  cc.vpunpckhdq(xmm_a, xmm_b, xmm_c);
  cc.vpunpckhqdq(xmm_a, xmm_b, xmm_c);
  cc.vpunpckhwd(xmm_a, xmm_b, xmm_c);
  cc.vpunpcklbw(xmm_a, xmm_b, xmm_c);
  cc.vpunpckldq(xmm_a, xmm_b, xmm_c);
  cc.vpunpcklqdq(xmm_a, xmm_b, xmm_c);
  cc.vpunpcklwd(xmm_a, xmm_b, xmm_c);
  cc.vpxor(xmm_a, xmm_b, xmm_c);
  cc.vrcpps(xmm_a, xmm_b);
  cc.vrcpps(ymm_a, ymm_b);
  cc.vrcpss(xmm_a, xmm_b, xmm_c);
  cc.vrsqrtps(xmm_a, xmm_b);
  cc.vrsqrtps(ymm_a, ymm_b);
  cc.vrsqrtss(xmm_a, xmm_b, xmm_c);
  cc.vroundpd(xmm_a, xmm_b, 0);
  cc.vroundpd(ymm_a, ymm_b, 0);
  cc.vroundps(xmm_a, xmm_b, 0);
  cc.vroundps(ymm_a, ymm_b, 0);
  cc.vroundsd(xmm_a, xmm_b, xmm_c, 0);
  cc.vroundss(xmm_a, xmm_b, xmm_c, 0);
  cc.vshufpd(xmm_a, xmm_b, xmm_c, 0);
  cc.vshufpd(ymm_a, ymm_b, ymm_c, 0);
  cc.vshufps(xmm_a, xmm_b, xmm_c, 0);
  cc.vshufps(ymm_a, ymm_b, ymm_c, 0);
  cc.vsqrtpd(xmm_a, xmm_b);
  cc.vsqrtpd(ymm_a, ymm_b);
  cc.vsqrtps(xmm_a, xmm_b);
  cc.vsqrtps(ymm_a, ymm_b);
  cc.vsqrtsd(xmm_a, xmm_b, xmm_c);
  cc.vsqrtss(xmm_a, xmm_b, xmm_c);
  cc.vsubpd(xmm_a, xmm_b, xmm_c);
  cc.vsubpd(ymm_a, ymm_b, ymm_c);
  cc.vsubps(xmm_a, xmm_b, xmm_c);
  cc.vsubps(ymm_a, ymm_b, ymm_c);
  cc.vsubsd(xmm_a, xmm_b, xmm_c);
  cc.vsubss(xmm_a, xmm_b, xmm_c);
  cc.vtestps(xmm_a, xmm_b);
  cc.vtestps(ymm_a, ymm_b);
  cc.vtestpd(xmm_a, xmm_b);
  cc.vtestpd(ymm_a, ymm_b);
  cc.vucomisd(xmm_a, xmm_b);
  cc.vucomiss(xmm_a, xmm_b);
  cc.vunpckhpd(xmm_a, xmm_b, xmm_c);
  cc.vunpckhpd(ymm_a, ymm_b, ymm_c);
  cc.vunpckhps(xmm_a, xmm_b, xmm_c);
  cc.vunpckhps(ymm_a, ymm_b, ymm_c);
  cc.vunpcklpd(xmm_a, xmm_b, xmm_c);
  cc.vunpcklpd(ymm_a, ymm_b, ymm_c);
  cc.vunpcklps(xmm_a, xmm_b, xmm_c);
  cc.vunpcklps(ymm_a, ymm_b, ymm_c);
  cc.vxorpd(xmm_a, xmm_b, xmm_c);
  cc.vxorpd(ymm_a, ymm_b, ymm_c);
  cc.vxorps(xmm_a, xmm_b, xmm_c);
  cc.vxorps(ymm_a, ymm_b, ymm_c);

  // AVX+AESNI.
  cc.vaesdec(xmm_a, xmm_b, xmm_c);
  cc.vaesdeclast(xmm_a, xmm_b, xmm_c);
  cc.vaesenc(xmm_a, xmm_b, xmm_c);
  cc.vaesenclast(xmm_a, xmm_b, xmm_c);
  cc.vaesimc(xmm_a, xmm_b);
  cc.vaeskeygenassist(xmm_a, xmm_b, 0);

  // AVX+PCLMULQDQ.
  cc.vpclmulqdq(xmm_a, xmm_b, xmm_c, 0);

  // AVX2.
  cc.vbroadcastsd(ymm_a, xmm_b);
  cc.vbroadcastss(xmm_a, xmm_b);
  cc.vbroadcastss(ymm_a, xmm_b);
  cc.vextracti128(xmm_a, ymm_b, 0);
  cc.vinserti128(ymm_a, ymm_b, xmm_c, 0);
  cc.vmpsadbw(ymm_a, ymm_b, ymm_c, 0);
  cc.vpabsb(ymm_a, ymm_b);
  cc.vpabsd(ymm_a, ymm_b);
  cc.vpabsw(ymm_a, ymm_b);
  cc.vpackssdw(ymm_a, ymm_b, ymm_c);
  cc.vpacksswb(ymm_a, ymm_b, ymm_c);
  cc.vpackusdw(ymm_a, ymm_b, ymm_c);
  cc.vpackuswb(ymm_a, ymm_b, ymm_c);
  cc.vpaddb(ymm_a, ymm_b, ymm_c);
  cc.vpaddd(ymm_a, ymm_b, ymm_c);
  cc.vpaddq(ymm_a, ymm_b, ymm_c);
  cc.vpaddw(ymm_a, ymm_b, ymm_c);
  cc.vpaddsb(ymm_a, ymm_b, ymm_c);
  cc.vpaddsw(ymm_a, ymm_b, ymm_c);
  cc.vpaddusb(ymm_a, ymm_b, ymm_c);
  cc.vpaddusw(ymm_a, ymm_b, ymm_c);
  cc.vpalignr(ymm_a, ymm_b, ymm_c, 0);
  cc.vpand(ymm_a, ymm_b, ymm_c);
  cc.vpandn(ymm_a, ymm_b, ymm_c);
  cc.vpavgb(ymm_a, ymm_b, ymm_c);
  cc.vpavgw(ymm_a, ymm_b, ymm_c);
  cc.vpblendd(xmm_a, xmm_b, xmm_c, 0);
  cc.vpblendd(ymm_a, ymm_b, ymm_c, 0);
  cc.vpblendvb(ymm_a, ymm_b, ymm_c, ymm_a);
  cc.vpblendw(ymm_a, ymm_b, ymm_c, 0);
  cc.vpbroadcastb(xmm_a, xmm_b);
  cc.vpbroadcastb(ymm_a, xmm_b);
  cc.vpbroadcastd(xmm_a, xmm_b);
  cc.vpbroadcastd(ymm_a, xmm_b);
  cc.vpbroadcastq(xmm_a, xmm_b);
  cc.vpbroadcastq(ymm_a, xmm_b);
  cc.vpbroadcastw(xmm_a, xmm_b);
  cc.vpbroadcastw(ymm_a, xmm_b);
  cc.vpcmpeqb(ymm_a, ymm_b, ymm_c);
  cc.vpcmpeqd(ymm_a, ymm_b, ymm_c);
  cc.vpcmpeqq(ymm_a, ymm_b, ymm_c);
  cc.vpcmpeqw(ymm_a, ymm_b, ymm_c);
  cc.vpcmpgtb(ymm_a, ymm_b, ymm_c);
  cc.vpcmpgtd(ymm_a, ymm_b, ymm_c);
  cc.vpcmpgtq(ymm_a, ymm_b, ymm_c);
  cc.vpcmpgtw(ymm_a, ymm_b, ymm_c);
  cc.vperm2i128(ymm_a, ymm_b, ymm_c, 0);
  cc.vpermd(ymm_a, ymm_b, ymm_c);
  cc.vpermps(ymm_a, ymm_b, ymm_c);
  cc.vpermpd(ymm_a, ymm_b, 0);
  cc.vpermq(ymm_a, ymm_b, 0);
  cc.vpmovmskb(gpd, ymm_b);
  cc.vpmovsxbd(ymm_a, xmm_b);
  cc.vpmovsxbq(ymm_a, xmm_b);
  cc.vpmovsxbw(ymm_a, xmm_b);
  cc.vpmovsxdq(ymm_a, xmm_b);
  cc.vpmovsxwd(ymm_a, xmm_b);
  cc.vpmovsxwq(ymm_a, xmm_b);
  cc.vpmovzxbd(ymm_a, xmm_b);
  cc.vpmovzxbq(ymm_a, xmm_b);
  cc.vpmovzxbw(ymm_a, xmm_b);
  cc.vpmovzxdq(ymm_a, xmm_b);
  cc.vpmovzxwd(ymm_a, xmm_b);
  cc.vpmovzxwq(ymm_a, xmm_b);
  cc.vpshufd(ymm_a, ymm_b, 0);
  cc.vpshufhw(ymm_a, ymm_b, 0);
  cc.vpshuflw(ymm_a, ymm_b, 0);
  cc.vpslld(ymm_a, ymm_b, 0);
  cc.vpslldq(ymm_a, ymm_b, 0);
  cc.vpsllq(ymm_a, ymm_b, 0);
  cc.vpsllw(ymm_a, ymm_b, 0);
  cc.vpsrad(ymm_a, ymm_b, 0);
  cc.vpsraw(ymm_a, ymm_b, 0);
  cc.vpsrld(ymm_a, ymm_b, 0);
  cc.vpsrldq(ymm_a, ymm_b, 0);
  cc.vpsrlq(ymm_a, ymm_b, 0);
  cc.vpsrlw(ymm_a, ymm_b, 0);
  cc.vphaddd(ymm_a, ymm_b, ymm_c);
  cc.vphaddsw(ymm_a, ymm_b, ymm_c);
  cc.vphaddw(ymm_a, ymm_b, ymm_c);
  cc.vphsubd(ymm_a, ymm_b, ymm_c);
  cc.vphsubsw(ymm_a, ymm_b, ymm_c);
  cc.vphsubw(ymm_a, ymm_b, ymm_c);
  cc.vpmaddubsw(ymm_a, ymm_b, ymm_c);
  cc.vpmaddwd(ymm_a, ymm_b, ymm_c);
  cc.vpmaxsb(ymm_a, ymm_b, ymm_c);
  cc.vpmaxsd(ymm_a, ymm_b, ymm_c);
  cc.vpmaxsw(ymm_a, ymm_b, ymm_c);
  cc.vpmaxub(ymm_a, ymm_b, ymm_c);
  cc.vpmaxud(ymm_a, ymm_b, ymm_c);
  cc.vpmaxuw(ymm_a, ymm_b, ymm_c);
  cc.vpminsb(ymm_a, ymm_b, ymm_c);
  cc.vpminsd(ymm_a, ymm_b, ymm_c);
  cc.vpminsw(ymm_a, ymm_b, ymm_c);
  cc.vpminub(ymm_a, ymm_b, ymm_c);
  cc.vpminud(ymm_a, ymm_b, ymm_c);
  cc.vpminuw(ymm_a, ymm_b, ymm_c);
  cc.vpmuldq(ymm_a, ymm_b, ymm_c);
  cc.vpmulhrsw(ymm_a, ymm_b, ymm_c);
  cc.vpmulhuw(ymm_a, ymm_b, ymm_c);
  cc.vpmulhw(ymm_a, ymm_b, ymm_c);
  cc.vpmulld(ymm_a, ymm_b, ymm_c);
  cc.vpmullw(ymm_a, ymm_b, ymm_c);
  cc.vpmuludq(ymm_a, ymm_b, ymm_c);
  cc.vpor(ymm_a, ymm_b, ymm_c);
  cc.vpsadbw(ymm_a, ymm_b, ymm_c);
  cc.vpshufb(ymm_a, ymm_b, ymm_c);
  cc.vpsignb(ymm_a, ymm_b, ymm_c);
  cc.vpsignd(ymm_a, ymm_b, ymm_c);
  cc.vpsignw(ymm_a, ymm_b, ymm_c);
  // AVX2 YMM shifts take the shift count in an XMM register.
  cc.vpslld(ymm_a, ymm_b, xmm_c);
  cc.vpsllq(ymm_a, ymm_b, xmm_c);
  cc.vpsllvd(xmm_a, xmm_b, xmm_c);
  cc.vpsllvd(ymm_a, ymm_b, ymm_c);
  cc.vpsllvq(xmm_a, xmm_b, xmm_c);
  cc.vpsllvq(ymm_a, ymm_b, ymm_c);
  cc.vpsllw(ymm_a, ymm_b, xmm_c);
  cc.vpsrad(ymm_a, ymm_b, xmm_c);
  cc.vpsravd(xmm_a, xmm_b, xmm_c);
  cc.vpsravd(ymm_a, ymm_b, ymm_c);
  cc.vpsraw(ymm_a, ymm_b, xmm_c);
  cc.vpsrld(ymm_a, ymm_b, xmm_c);
  cc.vpsrlq(ymm_a, ymm_b, xmm_c);
  cc.vpsrlvd(xmm_a, xmm_b, xmm_c);
  cc.vpsrlvd(ymm_a, ymm_b, ymm_c);
  cc.vpsrlvq(xmm_a, xmm_b, xmm_c);
  cc.vpsrlvq(ymm_a, ymm_b, ymm_c);
  cc.vpsrlw(ymm_a, ymm_b, xmm_c);
  cc.vpsubb(ymm_a, ymm_b, ymm_c);
  cc.vpsubd(ymm_a, ymm_b, ymm_c);
  cc.vpsubq(ymm_a, ymm_b, ymm_c);
  cc.vpsubsb(ymm_a, ymm_b, ymm_c);
  cc.vpsubsw(ymm_a, ymm_b, ymm_c);
  cc.vpsubusb(ymm_a, ymm_b, ymm_c);
  cc.vpsubusw(ymm_a, ymm_b, ymm_c);
  cc.vpsubw(ymm_a, ymm_b, ymm_c);
  cc.vpunpckhbw(ymm_a, ymm_b, ymm_c);
  cc.vpunpckhdq(ymm_a, ymm_b, ymm_c);
  cc.vpunpckhqdq(ymm_a, ymm_b, ymm_c);
  cc.vpunpckhwd(ymm_a, ymm_b, ymm_c);
  cc.vpunpcklbw(ymm_a, ymm_b, ymm_c);
  cc.vpunpckldq(ymm_a, ymm_b, ymm_c);
  cc.vpunpcklqdq(ymm_a, ymm_b, ymm_c);
  cc.vpunpcklwd(ymm_a, ymm_b, ymm_c);
  cc.vpxor(ymm_a, ymm_b, ymm_c);

  // FMA.
  cc.vfmadd132pd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd132pd(ymm_a, ymm_b, ymm_c);
  cc.vfmadd132ps(xmm_a, xmm_b, xmm_c);
  cc.vfmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.vfmadd132sd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.vfmadd213pd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.vfmadd213ps(xmm_a, xmm_b, xmm_c);
  cc.vfmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.vfmadd213sd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.vfmadd231pd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.vfmadd231ps(xmm_a, xmm_b, xmm_c);
  cc.vfmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.vfmadd231sd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub132pd(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub132pd(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub132ps(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub132ps(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub213pd(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub213pd(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub213ps(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub213ps(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub231pd(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub231pd(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub231ps(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub231ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsub132pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsub132ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsub132sd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.vfmsub213pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsub213ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsub213sd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.vfmsub231pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsub231ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsub231sd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub231ss(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd132pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd132pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd132ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd132ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd213pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd213pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd213ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd213ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd231pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd231pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd231ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd231ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd132pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd132pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd132ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd132sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd213ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd213sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd231ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd231sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub132ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub132sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub213ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub213sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub231ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub231sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231ss(xmm_a, xmm_b, xmm_c);
}

// Generates a long sequence of AVX instructions using memory operands (the
// register-only counterpart is generated by generate_avx_sequenceInternalRegOnly).
template<typename Emitter>
static void generate_avx_sequenceInternalRegMem(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_c = vec_c.ymm();
  x86::Vec ymm_d = vec_d.ymm();

  x86::Mem m = x86::ptr(gpz);
  x86::Mem m128 = x86::xmmword_ptr(gpz);
  x86::Mem m256 = x86::xmmword_ptr(gpz);
  x86::Mem vx_ptr = x86::ptr(gpz, xmm_d);
  x86::Mem vy_ptr = x86::ptr(gpz, ymm_d);

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

  cc.vaddpd(xmm_a, xmm_b, m);
  cc.vaddpd(ymm_a, ymm_b, m);
  cc.vaddps(xmm_a, xmm_b, m);
  cc.vaddps(ymm_a, ymm_b, m);
  cc.vaddsd(xmm_a, xmm_b, m);
  cc.vaddss(xmm_a, xmm_b, m);
  cc.vaddsubpd(xmm_a, xmm_b, m);
  cc.vaddsubpd(ymm_a, ymm_b, m);
  cc.vaddsubps(xmm_a, xmm_b, m);
  cc.vaddsubps(ymm_a, ymm_b, m);
  cc.vandpd(xmm_a, xmm_b, m);
  cc.vandpd(ymm_a, ymm_b, m);
  cc.vandps(xmm_a, xmm_b, m);
  cc.vandps(ymm_a, ymm_b, m);
  cc.vandnpd(xmm_a, xmm_b, m);
  cc.vandnpd(ymm_a, ymm_b, m);
  cc.vandnps(xmm_a, xmm_b, m);
  cc.vandnps(ymm_a, ymm_b, m);
  cc.vblendpd(xmm_a, xmm_b, m, 0);
  cc.vblendpd(ymm_a, ymm_b, m, 0);
  cc.vblendps(xmm_a, xmm_b, m, 0);
  cc.vblendps(ymm_a, ymm_b, m, 0);
  cc.vblendvpd(xmm_a, xmm_b, m, xmm_a);
  cc.vblendvpd(ymm_a, ymm_b, m, ymm_a);
  cc.vbroadcastf128(ymm_a, m);
  cc.vbroadcastsd(ymm_a, m);
  cc.vbroadcastss(xmm_a, m);
  cc.vbroadcastss(ymm_a, m);
  cc.vcmppd(xmm_a, xmm_b, m, 0);
  cc.vcmppd(ymm_a, ymm_b, m, 0);
  cc.vcmpps(xmm_a, xmm_b, m, 0);
  cc.vcmpps(ymm_a, ymm_b, m, 0);
  cc.vcmpsd(xmm_a, xmm_b, m, 0);
  cc.vcmpss(xmm_a, xmm_b, m, 0);
  cc.vcomisd(xmm_a, m);
  cc.vcomiss(xmm_a, m);
  cc.vcvtdq2pd(xmm_a, m);
  cc.vcvtdq2pd(ymm_a, m);
  cc.vcvtdq2ps(xmm_a, m);
  cc.vcvtdq2ps(ymm_a, m);
  cc.vcvtpd2dq(xmm_a, m128);
  cc.vcvtpd2dq(xmm_a, m256);
  cc.vcvtpd2ps(xmm_a, m128);
  cc.vcvtpd2ps(xmm_a, m256);
  cc.vcvtps2dq(xmm_a, m);
  cc.vcvtps2dq(ymm_a, m);
  cc.vcvtps2pd(xmm_a, m);
  cc.vcvtps2pd(ymm_a, m);
  cc.vcvtsd2si(gpd, m);
  cc.vcvtsd2ss(xmm_a, xmm_b, m);
  cc.vcvtsi2sd(xmm_a, xmm_b, m);
  cc.vcvtsi2ss(xmm_a, xmm_b, m);
  cc.vcvtss2sd(xmm_a, xmm_b, m);
  cc.vcvtss2si(gpd, m);
  cc.vcvttpd2dq(xmm_a, m128);
  cc.vcvttpd2dq(xmm_a, m256);
  cc.vcvttps2dq(xmm_a, m);
  cc.vcvttps2dq(ymm_a, m);
  cc.vcvttsd2si(gpd, m);
  cc.vcvttss2si(gpd, m);
  cc.vdivpd(xmm_a, xmm_b, m);
  cc.vdivpd(ymm_a, ymm_b, m);
  cc.vdivps(xmm_a, xmm_b, m);
  cc.vdivps(ymm_a, ymm_b, m);
  cc.vdivsd(xmm_a, xmm_b, m);
  cc.vdivss(xmm_a, xmm_b, m);
  cc.vdppd(xmm_a, xmm_b, m, 0);
  cc.vdpps(xmm_a, xmm_b, m, 0);
  cc.vdpps(ymm_a, ymm_b, m, 0);
  cc.vextractf128(m, ymm_b, 0);
  cc.vextractps(m, xmm_b, 0);
  cc.vhaddpd(xmm_a, xmm_b, m);
  cc.vhaddpd(ymm_a, ymm_b, m);
  cc.vhaddps(xmm_a, xmm_b, m);
  cc.vhaddps(ymm_a, ymm_b, m);
  cc.vhsubpd(xmm_a, xmm_b, m);
  cc.vhsubpd(ymm_a, ymm_b, m);
  cc.vhsubps(xmm_a, xmm_b, m);
  cc.vhsubps(ymm_a, ymm_b, m);
  cc.vinsertf128(ymm_a, ymm_b, m, 0);
  cc.vinsertps(xmm_a, xmm_b, m, 0);
  cc.vlddqu(xmm_a, m);
  cc.vlddqu(ymm_a, m);
  cc.vmaskmovps(xmm_a, xmm_b, m);
  cc.vmaskmovps(ymm_a, ymm_b, m);
  cc.vmaskmovps(m, xmm_b, xmm_c);
  cc.vmaskmovps(m, ymm_b, ymm_c);
  cc.vmaskmovpd(xmm_a, xmm_b, m);
  cc.vmaskmovpd(ymm_a, ymm_b, m);
  cc.vmaskmovpd(m, xmm_b, xmm_c);
  cc.vmaskmovpd(m, ymm_b, ymm_c);
  cc.vmaxpd(xmm_a, xmm_b, m);
  cc.vmaxpd(ymm_a, ymm_b, m);
  cc.vmaxps(xmm_a, xmm_b, m);
  cc.vmaxps(ymm_a, ymm_b, m);
  cc.vmaxsd(xmm_a, xmm_b, m);
  cc.vmaxss(xmm_a, xmm_b, m);
  cc.vminpd(xmm_a, xmm_b, m);
  cc.vminpd(ymm_a, ymm_b, m);
  cc.vminps(xmm_a, xmm_b, m);
  cc.vminps(ymm_a, ymm_b, m);
  cc.vminsd(xmm_a, xmm_b, m);
  cc.vminss(xmm_a, xmm_b, m);
  cc.vmovapd(xmm_a, m);
  cc.vmovapd(m, xmm_b);
  cc.vmovapd(ymm_a, m);
  cc.vmovapd(m, ymm_b);
  cc.vmovaps(xmm_a, m);
  cc.vmovaps(m, xmm_b);
  cc.vmovaps(ymm_a, m);
  cc.vmovaps(m, ymm_b);
  cc.vmovd(xmm_a, m);
  cc.vmovd(m, xmm_b);
  cc.vmovddup(xmm_a, m);
  cc.vmovddup(ymm_a, m);
  cc.vmovdqa(xmm_a, m);
  cc.vmovdqa(m, xmm_b);
  cc.vmovdqa(ymm_a, m);
  cc.vmovdqa(m, ymm_b);
  cc.vmovdqu(xmm_a, m);
  cc.vmovdqu(m, xmm_b);
  cc.vmovdqu(ymm_a, m);
  cc.vmovdqu(m, ymm_b);
  cc.vmovhpd(xmm_a, xmm_b, m);
  cc.vmovhps(xmm_a, xmm_b, m);
  cc.vmovhps(m, xmm_b);
  cc.vmovlpd(xmm_a, xmm_b, m);
  cc.vmovlpd(m, xmm_b);
  cc.vmovlps(xmm_a, xmm_b, m);
  cc.vmovlps(m, xmm_b);
  cc.vmovntdq(m, xmm_b);
  cc.vmovntdq(m, ymm_b);
  cc.vmovntdqa(xmm_a, m);
  cc.vmovntpd(m, xmm_b);
  cc.vmovntpd(m, ymm_b);
  cc.vmovntps(m, xmm_b);
  cc.vmovntps(m, ymm_b);
  cc.vmovsd(xmm_a, m);
  cc.vmovsd(m, xmm_b);
  cc.vmovshdup(xmm_a, m);
  cc.vmovshdup(ymm_a, m);
  cc.vmovsldup(xmm_a, m);
  cc.vmovsldup(ymm_a, m);
  cc.vmovss(xmm_a, m);
  cc.vmovss(m, xmm_b);
  cc.vmovupd(xmm_a, m);
  cc.vmovupd(m, xmm_b);
  cc.vmovupd(ymm_a, m);
  cc.vmovupd(m, ymm_b);
  cc.vmovups(xmm_a, m);
  cc.vmovups(m, xmm_b);
  cc.vmovups(ymm_a, m);
  cc.vmovups(m, ymm_b);
  cc.vmpsadbw(xmm_a, xmm_b, m, 0);
  cc.vmulpd(xmm_a, xmm_b, m);
  cc.vmulpd(ymm_a, ymm_b, m);
  cc.vmulps(xmm_a, xmm_b, m);
  cc.vmulps(ymm_a, ymm_b, m);
  cc.vmulsd(xmm_a, xmm_b, m);
  cc.vmulss(xmm_a, xmm_b, m);
  cc.vorpd(xmm_a, xmm_b, m);
  cc.vorpd(ymm_a, ymm_b, m);
  cc.vorps(xmm_a, xmm_b, m);
  cc.vorps(ymm_a, ymm_b, m);
  cc.vpabsb(xmm_a, m);
  cc.vpabsd(xmm_a, m);
  cc.vpabsw(xmm_a, m);
  cc.vpackssdw(xmm_a, xmm_b, m);
  cc.vpacksswb(xmm_a, xmm_b, m);
  cc.vpackusdw(xmm_a, xmm_b, m);
  cc.vpackuswb(xmm_a, xmm_b, m);
  cc.vpaddb(xmm_a, xmm_b, m);
  cc.vpaddd(xmm_a, xmm_b, m);
  cc.vpaddq(xmm_a, xmm_b, m);
  cc.vpaddw(xmm_a, xmm_b, m);
  cc.vpaddsb(xmm_a, xmm_b, m);
  cc.vpaddsw(xmm_a, xmm_b, m);
  cc.vpaddusb(xmm_a, xmm_b, m);
  cc.vpaddusw(xmm_a, xmm_b, m);
  cc.vpalignr(xmm_a, xmm_b, m, 0);
  cc.vpand(xmm_a, xmm_b, m);
  cc.vpandn(xmm_a, xmm_b, m);
  cc.vpavgb(xmm_a, xmm_b, m);
  cc.vpavgw(xmm_a, xmm_b, m);
  cc.vpblendvb(xmm_a, xmm_b, m, xmm_a);
  cc.vpblendw(xmm_a, xmm_b, m, 0);
  cc.vpcmpeqb(xmm_a, xmm_b, m);
  cc.vpcmpeqd(xmm_a, xmm_b, m);
  cc.vpcmpeqq(xmm_a, xmm_b, m);
  cc.vpcmpeqw(xmm_a, xmm_b, m);
  cc.vpcmpgtb(xmm_a, xmm_b, m);
  cc.vpcmpgtd(xmm_a, xmm_b, m);
  cc.vpcmpgtq(xmm_a, xmm_b, m);
  cc.vpcmpgtw(xmm_a, xmm_b, m);
  cc.vpermilpd(xmm_a, xmm_b, m);
  cc.vpermilpd(ymm_a, ymm_b, m);
  cc.vpermilpd(xmm_a, m, 0);
  cc.vpermilpd(ymm_a, m, 0);
  cc.vpermilps(xmm_a, xmm_b, m);
  cc.vpermilps(ymm_a, ymm_b, m);
  cc.vpermilps(xmm_a, m, 0);
  cc.vpermilps(ymm_a, m, 0);
  cc.vperm2f128(ymm_a, ymm_b, m, 0);
  cc.vpextrb(m, xmm_b, 0);
  cc.vpextrd(m, xmm_b, 0);
  if (cc.is_64bit()) cc.vpextrq(m, xmm_b, 0);
  cc.vpextrw(m, xmm_b, 0);
  cc.vphaddd(xmm_a, xmm_b, m);
  cc.vphaddsw(xmm_a, xmm_b, m);
  cc.vphaddw(xmm_a, xmm_b, m);
  cc.vphminposuw(xmm_a, m);
  cc.vphsubd(xmm_a, xmm_b, m);
  cc.vphsubsw(xmm_a, xmm_b, m);
  cc.vphsubw(xmm_a, xmm_b, m);
  cc.vpinsrb(xmm_a, xmm_b, m, 0);
  cc.vpinsrd(xmm_a, xmm_b, m, 0);
  cc.vpinsrw(xmm_a, xmm_b, m, 0);
  cc.vpmaddubsw(xmm_a, xmm_b, m);
  cc.vpmaddwd(xmm_a, xmm_b, m);
  cc.vpmaxsb(xmm_a, xmm_b, m);
  cc.vpmaxsd(xmm_a, xmm_b, m);
  cc.vpmaxsw(xmm_a, xmm_b, m);
  cc.vpmaxub(xmm_a, xmm_b, m);
  cc.vpmaxud(xmm_a, xmm_b, m);
  cc.vpmaxuw(xmm_a, xmm_b, m);
  cc.vpminsb(xmm_a, xmm_b, m);
  cc.vpminsd(xmm_a, xmm_b, m);
  cc.vpminsw(xmm_a, xmm_b, m);
  cc.vpminub(xmm_a, xmm_b, m);
  cc.vpminud(xmm_a, xmm_b, m);
  cc.vpminuw(xmm_a, xmm_b, m);
  cc.vpmovsxbd(xmm_a, m);
  cc.vpmovsxbq(xmm_a, m);
  cc.vpmovsxbw(xmm_a, m);
  cc.vpmovsxdq(xmm_a, m);
  cc.vpmovsxwd(xmm_a, m);
  cc.vpmovsxwq(xmm_a, m);
  cc.vpmovzxbd(xmm_a, m);
  cc.vpmovzxbq(xmm_a, m);
  cc.vpmovzxbw(xmm_a, m);
  cc.vpmovzxdq(xmm_a, m);
  cc.vpmovzxwd(xmm_a, m);
  cc.vpmovzxwq(xmm_a, m);
  cc.vpmuldq(xmm_a, xmm_b, m);
  cc.vpmulhrsw(xmm_a, xmm_b, m);
  cc.vpmulhuw(xmm_a, xmm_b, m);
  cc.vpmulhw(xmm_a, xmm_b, m);
  cc.vpmulld(xmm_a, xmm_b, m);
  cc.vpmullw(xmm_a, xmm_b, m);
  cc.vpmuludq(xmm_a, xmm_b, m);
  cc.vpor(xmm_a, xmm_b, m);
  cc.vpsadbw(xmm_a, xmm_b, m);
  cc.vpshufb(xmm_a, xmm_b, m);
  cc.vpshufd(xmm_a, m, 0);
  cc.vpshufhw(xmm_a, m, 0);
  cc.vpshuflw(xmm_a, m, 0);
  cc.vpsignb(xmm_a, xmm_b, m);
  cc.vpsignd(xmm_a, xmm_b, m);
  cc.vpsignw(xmm_a, xmm_b, m);
  cc.vpslld(xmm_a, xmm_b, m);
  cc.vpsllq(xmm_a, xmm_b, m);
  cc.vpsllw(xmm_a, xmm_b, m);
  cc.vpsrad(xmm_a, xmm_b, m);
  cc.vpsraw(xmm_a, xmm_b, m);
  cc.vpsrld(xmm_a, xmm_b, m);
  cc.vpsrlq(xmm_a, xmm_b, m);
  cc.vpsrlw(xmm_a, xmm_b, m);
  cc.vpsubb(xmm_a, xmm_b, m);
  cc.vpsubd(xmm_a, xmm_b, m);
  cc.vpsubq(xmm_a, xmm_b, m);
  cc.vpsubw(xmm_a, xmm_b, m);
  cc.vpsubsb(xmm_a, xmm_b, m);
  cc.vpsubsw(xmm_a, xmm_b, m);
  cc.vpsubusb(xmm_a, xmm_b, m);
  cc.vpsubusw(xmm_a, xmm_b, m);
  cc.vptest(xmm_a, m);
  cc.vptest(ymm_a, m);
  cc.vpunpckhbw(xmm_a, xmm_b, m);
  cc.vpunpckhdq(xmm_a, xmm_b, m);
  cc.vpunpckhqdq(xmm_a, xmm_b, m);
  cc.vpunpckhwd(xmm_a, xmm_b, m);
  cc.vpunpcklbw(xmm_a, xmm_b, m);
  cc.vpunpckldq(xmm_a, xmm_b, m);
  cc.vpunpcklqdq(xmm_a, xmm_b, m);
  cc.vpunpcklwd(xmm_a, xmm_b, m);
  cc.vpxor(xmm_a, xmm_b, m);
  cc.vrcpps(xmm_a, m);
  cc.vrcpps(ymm_a, m);
  cc.vrcpss(xmm_a, xmm_b, m);
  cc.vrsqrtps(xmm_a, m);
  cc.vrsqrtps(ymm_a, m);
  cc.vrsqrtss(xmm_a, xmm_b, m);
  cc.vroundpd(xmm_a, m, 0);
  cc.vroundpd(ymm_a, m, 0);
  cc.vroundps(xmm_a, m, 0);
  cc.vroundps(ymm_a, m, 0);
  cc.vroundsd(xmm_a, xmm_b, m, 0);
  cc.vroundss(xmm_a, xmm_b, m, 0);
  cc.vshufpd(xmm_a, xmm_b, m, 0);
  cc.vshufpd(ymm_a, ymm_b, m, 0);
  cc.vshufps(xmm_a, xmm_b, m, 0);
  cc.vshufps(ymm_a, ymm_b, m, 0);
  cc.vsqrtpd(xmm_a, m);
  cc.vsqrtpd(ymm_a, m);
  cc.vsqrtps(xmm_a, m);
  cc.vsqrtps(ymm_a, m);
  cc.vsqrtsd(xmm_a, xmm_b, m);
  cc.vsqrtss(xmm_a, xmm_b, m);
  cc.vsubpd(xmm_a, xmm_b, m);
  cc.vsubpd(ymm_a, ymm_b, m);
  cc.vsubps(xmm_a, xmm_b, m);
  cc.vsubps(ymm_a, ymm_b, m);
  cc.vsubsd(xmm_a, xmm_b, m);
  cc.vsubss(xmm_a, xmm_b, m);
  cc.vtestps(xmm_a, m);
  cc.vtestps(ymm_a, m);
  cc.vtestpd(xmm_a, m);
  cc.vtestpd(ymm_a, m);
  cc.vucomisd(xmm_a, m);
  cc.vucomiss(xmm_a, m);
  cc.vunpckhpd(xmm_a, xmm_b, m);
  cc.vunpckhpd(ymm_a, ymm_b, m);
  cc.vunpckhps(xmm_a, xmm_b, m);
  cc.vunpckhps(ymm_a, ymm_b, m);
  cc.vunpcklpd(xmm_a, xmm_b, m);
  cc.vunpcklpd(ymm_a, ymm_b, m);
  cc.vunpcklps(xmm_a, xmm_b, m);
  cc.vunpcklps(ymm_a, ymm_b, m);
  cc.vxorpd(xmm_a, xmm_b, m);
  cc.vxorpd(ymm_a, ymm_b, m);
  cc.vxorps(xmm_a, xmm_b, m);
  cc.vxorps(ymm_a, ymm_b, m);

  // AVX+AESNI.
  cc.vaesdec(xmm_a, xmm_b, m);
  cc.vaesdeclast(xmm_a, xmm_b, m);
  cc.vaesenc(xmm_a, xmm_b, m);
  cc.vaesenclast(xmm_a, xmm_b, m);
  cc.vaesimc(xmm_a, m);
  cc.vaeskeygenassist(xmm_a, m, 0);

  // AVX+PCLMULQDQ.
  cc.vpclmulqdq(xmm_a, xmm_b, m, 0);

  // AVX2.
  cc.vbroadcasti128(ymm_a, m);
  cc.vextracti128(m, ymm_b, 0);
  cc.vgatherdpd(xmm_a, vx_ptr, xmm_c);
  cc.vgatherdpd(ymm_a, vx_ptr, ymm_c);
  cc.vgatherdps(xmm_a, vx_ptr, xmm_c);
  cc.vgatherdps(ymm_a, vy_ptr, ymm_c);
  cc.vgatherqpd(xmm_a, vx_ptr, xmm_c);
  cc.vgatherqpd(ymm_a, vy_ptr, ymm_c);
  cc.vgatherqps(xmm_a, vx_ptr, xmm_c);
  cc.vgatherqps(xmm_a, vy_ptr, xmm_c);
  cc.vinserti128(ymm_a, ymm_b, m, 0);
  cc.vmovntdqa(ymm_a, m);
  cc.vmpsadbw(ymm_a, ymm_b, m, 0);
  cc.vpabsb(ymm_a, m);
  cc.vpabsd(ymm_a, m);
  cc.vpabsw(ymm_a, m);
  cc.vpackssdw(ymm_a, ymm_b, m);
  cc.vpacksswb(ymm_a, ymm_b, m);
  cc.vpackusdw(ymm_a, ymm_b, m);
  cc.vpackuswb(ymm_a, ymm_b, m);
  cc.vpaddb(ymm_a, ymm_b, m);
  cc.vpaddd(ymm_a, ymm_b, m);
  cc.vpaddq(ymm_a, ymm_b, m);
  cc.vpaddw(ymm_a, ymm_b, m);
  cc.vpaddsb(ymm_a, ymm_b, m);
  cc.vpaddsw(ymm_a, ymm_b, m);
  cc.vpaddusb(ymm_a, ymm_b, m);
  cc.vpaddusw(ymm_a, ymm_b, m);
  cc.vpalignr(ymm_a, ymm_b, m, 0);
  cc.vpand(ymm_a, ymm_b, m);
  cc.vpandn(ymm_a, ymm_b, m);
  cc.vpavgb(ymm_a, ymm_b, m);
  cc.vpavgw(ymm_a, ymm_b, m);
  cc.vpblendd(xmm_a, xmm_b, m, 0);
  cc.vpblendd(ymm_a, ymm_b, m, 0);
  cc.vpblendvb(ymm_a, ymm_b, m, ymm_a);
  cc.vpblendw(ymm_a, ymm_b, m, 0);
  cc.vpbroadcastb(xmm_a, m);
  cc.vpbroadcastb(ymm_a, m);
  cc.vpbroadcastd(xmm_a, m);
  cc.vpbroadcastd(ymm_a, m);
  cc.vpbroadcastq(xmm_a, m);
  cc.vpbroadcastq(ymm_a, m);
  cc.vpbroadcastw(xmm_a, m);
  cc.vpbroadcastw(ymm_a, m);
  cc.vpcmpeqb(ymm_a, ymm_b, m);
  cc.vpcmpeqd(ymm_a, ymm_b, m);
  cc.vpcmpeqq(ymm_a, ymm_b, m);
  cc.vpcmpeqw(ymm_a, ymm_b, m);
  cc.vpcmpgtb(ymm_a, ymm_b, m);
  cc.vpcmpgtd(ymm_a, ymm_b, m);
  cc.vpcmpgtq(ymm_a, ymm_b, m);
  cc.vpcmpgtw(ymm_a, ymm_b, m);
  cc.vperm2i128(ymm_a, ymm_b, m, 0);
  cc.vpermd(ymm_a, ymm_b, m);
  cc.vpermps(ymm_a, ymm_b, m);
  cc.vpermpd(ymm_a, m, 0);
  cc.vpermq(ymm_a, m, 0);
  cc.vpgatherdd(xmm_a, vx_ptr, xmm_c);
  cc.vpgatherdd(ymm_a, vy_ptr, ymm_c);
  cc.vpgatherdq(xmm_a, vx_ptr, xmm_c);
  cc.vpgatherdq(ymm_a, vx_ptr, ymm_c);
  cc.vpgatherqd(xmm_a, vx_ptr, xmm_c);
  cc.vpgatherqd(xmm_a, vy_ptr, xmm_c);
  cc.vpgatherqq(xmm_a, vx_ptr, xmm_c);
  cc.vpgatherqq(ymm_a, vy_ptr, ymm_c);
  cc.vpmovsxbd(ymm_a, m);
  cc.vpmovsxbq(ymm_a, m);
  cc.vpmovsxbw(ymm_a, m);
  cc.vpmovsxdq(ymm_a, m);
  cc.vpmovsxwd(ymm_a, m);
  cc.vpmovsxwq(ymm_a, m);
  cc.vpmovzxbd(ymm_a, m);
  cc.vpmovzxbq(ymm_a, m);
  cc.vpmovzxbw(ymm_a, m);
  cc.vpmovzxdq(ymm_a, m);
  cc.vpmovzxwd(ymm_a, m);
  cc.vpmovzxwq(ymm_a, m);
  cc.vpshufd(ymm_a, m, 0);
  cc.vpshufhw(ymm_a, m, 0);
  cc.vpshuflw(ymm_a, m, 0);
  cc.vphaddd(ymm_a, ymm_b, m);
  cc.vphaddsw(ymm_a, ymm_b, m);
  cc.vphaddw(ymm_a, ymm_b, m);
  cc.vphsubd(ymm_a, ymm_b, m);
  cc.vphsubsw(ymm_a, ymm_b, m);
  cc.vphsubw(ymm_a, ymm_b, m);
  cc.vpmaddubsw(ymm_a, ymm_b, m);
  cc.vpmaddwd(ymm_a, ymm_b, m);
  cc.vpmaskmovd(m, xmm_b, xmm_c);
  cc.vpmaskmovd(m, ymm_b, ymm_c);
  cc.vpmaskmovd(xmm_a, xmm_b, m);
  cc.vpmaskmovd(ymm_a, ymm_b, m);
  cc.vpmaskmovq(m, xmm_b, xmm_c);
  cc.vpmaskmovq(m, ymm_b, ymm_c);
  cc.vpmaskmovq(xmm_a, xmm_b, m);
  cc.vpmaskmovq(ymm_a, ymm_b, m);
  cc.vpmaxsb(ymm_a, ymm_b, m);
  cc.vpmaxsd(ymm_a, ymm_b, m);
  cc.vpmaxsw(ymm_a, ymm_b, m);
  cc.vpmaxub(ymm_a, ymm_b, m);
  cc.vpmaxud(ymm_a, ymm_b, m);
  cc.vpmaxuw(ymm_a, ymm_b, m);
  cc.vpminsb(ymm_a, ymm_b, m);
  cc.vpminsd(ymm_a, ymm_b, m);
  cc.vpminsw(ymm_a, ymm_b, m);
  cc.vpminub(ymm_a, ymm_b, m);
  cc.vpminud(ymm_a, ymm_b, m);
  cc.vpminuw(ymm_a, ymm_b, m);
  cc.vpmuldq(ymm_a, ymm_b, m);
  cc.vpmulhrsw(ymm_a, ymm_b, m);
  cc.vpmulhuw(ymm_a, ymm_b, m);
  cc.vpmulhw(ymm_a, ymm_b, m);
  cc.vpmulld(ymm_a, ymm_b, m);
  cc.vpmullw(ymm_a, ymm_b, m);
  cc.vpmuludq(ymm_a, ymm_b, m);
  cc.vpor(ymm_a, ymm_b, m);
  cc.vpsadbw(ymm_a, ymm_b, m);
  cc.vpshufb(ymm_a, ymm_b, m);
  cc.vpsignb(ymm_a, ymm_b, m);
  cc.vpsignd(ymm_a, ymm_b, m);
  cc.vpsignw(ymm_a, ymm_b, m);
  cc.vpslld(ymm_a, ymm_b, m);
  cc.vpsllq(ymm_a, ymm_b, m);
  cc.vpsllvd(xmm_a, xmm_b, m);
  cc.vpsllvd(ymm_a, ymm_b, m);
  cc.vpsllvq(xmm_a, xmm_b, m);
  cc.vpsllvq(ymm_a, ymm_b, m);
  cc.vpsllw(ymm_a, ymm_b, m);
  cc.vpsrad(ymm_a, ymm_b, m);
  cc.vpsravd(xmm_a, xmm_b, m);
  cc.vpsravd(ymm_a, ymm_b, m);
  cc.vpsraw(ymm_a, ymm_b, m);
  cc.vpsrld(ymm_a, ymm_b, m);
  cc.vpsrlq(ymm_a, ymm_b, m);
  cc.vpsrlvd(xmm_a, xmm_b, m);
  cc.vpsrlvd(ymm_a, ymm_b, m);
  cc.vpsrlvq(xmm_a, xmm_b, m);
  cc.vpsrlvq(ymm_a, ymm_b, m);
  cc.vpsrlw(ymm_a, ymm_b, m);
  cc.vpsubb(ymm_a, ymm_b, m);
  cc.vpsubd(ymm_a, ymm_b, m);
  cc.vpsubq(ymm_a, ymm_b, m);
  cc.vpsubsb(ymm_a, ymm_b, m);
  cc.vpsubsw(ymm_a, ymm_b, m);
  cc.vpsubusb(ymm_a, ymm_b, m);
  cc.vpsubusw(ymm_a, ymm_b, m);
  cc.vpsubw(ymm_a, ymm_b, m);
  cc.vpunpckhbw(ymm_a, ymm_b, m);
  cc.vpunpckhdq(ymm_a, ymm_b, m);
  cc.vpunpckhqdq(ymm_a, ymm_b, m);
  cc.vpunpckhwd(ymm_a, ymm_b, m);
  cc.vpunpcklbw(ymm_a, ymm_b, m);
  cc.vpunpckldq(ymm_a, ymm_b, m);
  cc.vpunpcklqdq(ymm_a, ymm_b, m);
  cc.vpunpcklwd(ymm_a, ymm_b, m);
  cc.vpxor(ymm_a, ymm_b, m);
}

// Generates a long sequence of AVX instructions.
template<typename Emitter>
static void generate_avx_sequenceInternal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  // Dispatches to the register-only or register+memory flavor of the AVX
  // instruction sequence, depending on the requested instruction form.
  //
  // NOTE(review): the camelCase `Internal` suffix differs from the snake_case
  // naming used by the other generators in this file (e.g.
  // `generate_avx512_sequence_internal_reg_only`); kept unchanged because
  // callers in this file depend on the current name.
  switch (form) {
    case InstForm::kReg:
      generate_avx_sequenceInternalRegOnly(cc, gp, vec_a, vec_b, vec_c, vec_d);
      break;

    case InstForm::kMem:
    default:
      generate_avx_sequenceInternalRegMem(cc, gp, vec_a, vec_b, vec_c, vec_d);
      break;
  }
}

static void generate_avx_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    // Compiler path: virtual registers, prolog/epilog handled by the compiler.
    Gp gp = cc.new_gpz("gp");
    Vec a = cc.new_ymm("a");
    Vec b = cc.new_ymm("b");
    Vec c = cc.new_ymm("c");
    Vec d = cc.new_ymm("d");

    cc.add_func(FuncSignature::build<void>());
    generate_avx_sequenceInternal(cc, form, gp, a, b, c, d);
    cc.end_func();

    return;
  }
#endif

  // Builder/Assembler paths use fixed physical registers and share the same
  // emission logic; the generic lambda is instantiated per emitter type.
  [[maybe_unused]] auto emit_body = [&](auto& cc) {
    if (!emit_prolog_epilog) {
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
      return;
    }

    FuncDetail func;
    func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

    // Mark all clobbered registers dirty so the frame saves/restores them.
    FuncFrame frame;
    frame.init(func);
    frame.add_dirty_regs(eax, ymm0, ymm1, ymm2, ymm3);
    frame.finalize();

    cc.emit_prolog(frame);
    generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
    cc.emit_epilog(frame);
  };

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    emit_body(*emitter.as<Builder>());
    return;
  }
#endif

  if (emitter.is_assembler()) {
    emit_body(*emitter.as<Assembler>());
    return;
  }
}

// Generates a long sequence of AVX512 instructions.
template<typename Emitter>
static void generate_avx512_sequence_internal_reg_only(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_c = vec_c.ymm();

  x86::Vec zmm_a = vec_a.zmm();
  x86::Vec zmm_b = vec_b.zmm();
  x86::Vec zmm_c = vec_c.zmm();

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

  cc.kaddb(kA, kB, kC);
  cc.kaddd(kA, kB, kC);
  cc.kaddq(kA, kB, kC);
  cc.kaddw(kA, kB, kC);
  cc.kandb(kA, kB, kC);
  cc.kandd(kA, kB, kC);
  cc.kandnb(kA, kB, kC);
  cc.kandnd(kA, kB, kC);
  cc.kandnq(kA, kB, kC);
  cc.kandnw(kA, kB, kC);
  cc.kandq(kA, kB, kC);
  cc.kandw(kA, kB, kC);
  cc.kmovb(kA, kB);
  cc.kmovb(kA, gpd);
  cc.kmovb(gpd, kB);
  cc.kmovd(kA, kB);
  cc.kmovd(kA, gpd);
  cc.kmovd(gpd, kB);
  cc.kmovq(kA, kB);
  if (cc.is_64bit()) cc.kmovq(kA, gpq);
  if (cc.is_64bit()) cc.kmovq(gpq, kB);
  cc.kmovw(kA, kB);
  cc.kmovw(kA, gpd);
  cc.kmovw(gpd, kB);
  cc.knotb(kA, kB);
  cc.knotd(kA, kB);
  cc.knotq(kA, kB);
  cc.knotw(kA, kB);
  cc.korb(kA, kB, kC);
  cc.kord(kA, kB, kC);
  cc.korq(kA, kB, kC);
  cc.kortestb(kA, kB);
  cc.kortestd(kA, kB);
  cc.kortestq(kA, kB);
  cc.kortestw(kA, kB);
  cc.korw(kA, kB, kC);
  cc.kshiftlb(kA, kB, 0);
  cc.kshiftld(kA, kB, 0);
  cc.kshiftlq(kA, kB, 0);
  cc.kshiftlw(kA, kB, 0);
  cc.kshiftrb(kA, kB, 0);
  cc.kshiftrd(kA, kB, 0);
  cc.kshiftrq(kA, kB, 0);
  cc.kshiftrw(kA, kB, 0);
  cc.ktestb(kA, kB);
  cc.ktestd(kA, kB);
  cc.ktestq(kA, kB);
  cc.ktestw(kA, kB);
  cc.kunpckbw(kA, kB, kC);
  cc.kunpckdq(kA, kB, kC);
  cc.kunpckwd(kA, kB, kC);
  cc.kxnorb(kA, kB, kC);
  cc.kxnord(kA, kB, kC);
  cc.kxnorq(kA, kB, kC);
  cc.kxnorw(kA, kB, kC);
  cc.kxorb(kA, kB, kC);
  cc.kxord(kA, kB, kC);
  cc.kxorq(kA, kB, kC);
  cc.kxorw(kA, kB, kC);
  cc.nop();

  cc.evex().vaddpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vaddpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vaddpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vaddps(xmm_a, xmm_b, xmm_c);
  cc.evex().vaddps(ymm_a, ymm_b, ymm_c);
  cc.evex().vaddps(zmm_a, zmm_b, zmm_c);
  cc.evex().vaddsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vaddss(xmm_a, xmm_b, xmm_c);
  cc.evex().valignd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().valignd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().valignd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().valignq(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().valignq(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().valignq(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vandnpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vandnpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vandnpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vandnps(xmm_a, xmm_b, xmm_c);
  cc.evex().vandnps(ymm_a, ymm_b, ymm_c);
  cc.evex().vandnps(zmm_a, zmm_b, zmm_c);
  cc.evex().vandpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vandpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vandpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vandps(xmm_a, xmm_b, xmm_c);
  cc.evex().vandps(ymm_a, ymm_b, ymm_c);
  cc.evex().vandps(zmm_a, zmm_b, zmm_c);
  cc.evex().vblendmpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vblendmpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vblendmpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vblendmps(xmm_a, xmm_b, xmm_c);
  cc.evex().vblendmps(ymm_a, ymm_b, ymm_c);
  cc.evex().vblendmps(zmm_a, zmm_b, zmm_c);
  cc.evex().vbroadcastf32x2(ymm_a, xmm_b);
  cc.evex().vbroadcastf32x2(zmm_a, xmm_b);
  cc.evex().vbroadcasti32x2(xmm_a, xmm_b);
  cc.evex().vbroadcasti32x2(ymm_a, xmm_b);
  cc.evex().vbroadcasti32x2(zmm_a, xmm_b);
  cc.evex().vbroadcastsd(ymm_a, xmm_b);
  cc.evex().vbroadcastsd(zmm_a, xmm_b);
  cc.evex().vbroadcastss(xmm_a, xmm_b);
  cc.evex().vbroadcastss(ymm_a, xmm_b);
  cc.evex().vbroadcastss(zmm_a, xmm_b);
  cc.evex().vcmppd(kA, xmm_b, xmm_c, 0);
  cc.evex().vcmppd(kA, ymm_b, ymm_c, 0);
  cc.evex().vcmppd(kA, zmm_b, zmm_c, 0);
  cc.evex().vcmpps(kA, xmm_b, xmm_c, 0);
  cc.evex().vcmpps(kA, ymm_b, ymm_c, 0);
  cc.evex().vcmpps(kA, zmm_b, zmm_c, 0);
  cc.evex().vcmpsd(kA, xmm_b, xmm_c, 0);
  cc.evex().vcmpss(kA, xmm_b, xmm_c, 0);
  cc.evex().vcomisd(xmm_a, xmm_b);
  cc.evex().vcomiss(xmm_a, xmm_b);
  cc.evex().vcompresspd(xmm_a, xmm_b);
  cc.evex().vcompresspd(ymm_a, ymm_b);
  cc.evex().vcompresspd(zmm_a, zmm_b);
  cc.evex().vcompressps(xmm_a, xmm_b);
  cc.evex().vcompressps(ymm_a, ymm_b);
  cc.evex().vcompressps(zmm_a, zmm_b);
  cc.evex().vcvtdq2pd(xmm_a, xmm_b);
  cc.evex().vcvtdq2pd(ymm_a, xmm_b);
  cc.evex().vcvtdq2pd(zmm_a, ymm_b);
  cc.evex().vcvtdq2ps(xmm_a, xmm_b);
  cc.evex().vcvtdq2ps(ymm_a, ymm_b);
  cc.evex().vcvtdq2ps(zmm_a, zmm_b);
  cc.evex().vcvtpd2dq(xmm_a, xmm_b);
  cc.evex().vcvtpd2dq(xmm_a, ymm_b);
  cc.evex().vcvtpd2dq(ymm_a, zmm_b);
  cc.evex().vcvtpd2qq(xmm_a, xmm_b);
  cc.evex().vcvtpd2qq(ymm_a, ymm_b);
  cc.evex().vcvtpd2qq(zmm_a, zmm_b);
  cc.evex().vcvtpd2udq(xmm_a, xmm_b);
  cc.evex().vcvtpd2udq(xmm_a, ymm_b);
  cc.evex().vcvtpd2udq(ymm_a, zmm_b);
  cc.evex().vcvtpd2uqq(xmm_a, xmm_b);
  cc.evex().vcvtpd2uqq(ymm_a, ymm_b);
  cc.evex().vcvtpd2uqq(zmm_a, zmm_b);
  cc.evex().vcvtph2ps(xmm_a, xmm_b);
  cc.evex().vcvtph2ps(ymm_a, xmm_b);
  cc.evex().vcvtph2ps(zmm_a, ymm_b);
  cc.evex().vcvtps2dq(xmm_a, xmm_b);
  cc.evex().vcvtps2dq(ymm_a, ymm_b);
  cc.evex().vcvtps2dq(zmm_a, zmm_b);
  cc.evex().vcvtps2pd(xmm_a, xmm_b);
  cc.evex().vcvtps2pd(ymm_a, xmm_b);
  cc.evex().vcvtps2pd(zmm_a, ymm_b);
  cc.evex().vcvtps2ph(xmm_a, xmm_b, 0);
  cc.evex().vcvtps2ph(xmm_a, ymm_b, 0);
  cc.evex().vcvtps2ph(ymm_a, zmm_b, 0);
  cc.evex().vcvtps2qq(xmm_a, xmm_b);
  cc.evex().vcvtps2qq(ymm_a, xmm_b);
  cc.evex().vcvtps2qq(zmm_a, ymm_b);
  cc.evex().vcvtps2udq(xmm_a, xmm_b);
  cc.evex().vcvtps2udq(ymm_a, ymm_b);
  cc.evex().vcvtps2udq(zmm_a, zmm_b);
  cc.evex().vcvtps2uqq(xmm_a, xmm_b);
  cc.evex().vcvtps2uqq(ymm_a, xmm_b);
  cc.evex().vcvtps2uqq(zmm_a, ymm_b);
  cc.evex().vcvtqq2pd(xmm_a, xmm_b);
  cc.evex().vcvtqq2pd(ymm_a, ymm_b);
  cc.evex().vcvtqq2pd(zmm_a, zmm_b);
  cc.evex().vcvtqq2ps(xmm_a, xmm_b);
  cc.evex().vcvtqq2ps(xmm_a, ymm_b);
  cc.evex().vcvtqq2ps(ymm_a, zmm_b);
  cc.evex().vcvtsd2si(gpd, xmm_b);
  cc.evex().vcvtsd2si(gpz, xmm_b);
  cc.evex().vcvtsd2ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vcvtsd2usi(gpd, xmm_b);
  cc.evex().vcvtsd2usi(gpz, xmm_b);
  cc.evex().vcvtsi2sd(xmm_a, xmm_b, gpd);
  cc.evex().vcvtsi2sd(xmm_a, xmm_b, gpz);
  cc.evex().vcvtsi2ss(xmm_a, xmm_b, gpd);
  cc.evex().vcvtsi2ss(xmm_a, xmm_b, gpz);
  cc.evex().vcvtss2sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vcvtss2si(gpd, xmm_b);
  cc.evex().vcvtss2si(gpz, xmm_b);
  cc.evex().vcvtss2usi(gpd, xmm_b);
  cc.evex().vcvtss2usi(gpz, xmm_b);
  cc.evex().vcvttpd2dq(xmm_a, xmm_b);
  cc.evex().vcvttpd2dq(xmm_a, ymm_b);
  cc.evex().vcvttpd2dq(ymm_a, zmm_b);
  cc.evex().vcvttpd2qq(xmm_a, xmm_b);
  cc.evex().vcvttpd2qq(ymm_a, ymm_b);
  cc.evex().vcvttpd2qq(zmm_a, zmm_b);
  cc.evex().vcvttpd2udq(xmm_a, xmm_b);
  cc.evex().vcvttpd2udq(xmm_a, ymm_b);
  cc.evex().vcvttpd2udq(ymm_a, zmm_b);
  cc.evex().vcvttpd2uqq(xmm_a, xmm_b);
  cc.evex().vcvttpd2uqq(ymm_a, ymm_b);
  cc.evex().vcvttpd2uqq(zmm_a, zmm_b);
  cc.evex().vcvttps2dq(xmm_a, xmm_b);
  cc.evex().vcvttps2dq(ymm_a, ymm_b);
  cc.evex().vcvttps2dq(zmm_a, zmm_b);
  cc.evex().vcvttps2qq(xmm_a, xmm_b);
  cc.evex().vcvttps2qq(ymm_a, xmm_b);
  cc.evex().vcvttps2qq(zmm_a, ymm_b);
  cc.evex().vcvttps2udq(xmm_a, xmm_b);
  cc.evex().vcvttps2udq(ymm_a, ymm_b);
  cc.evex().vcvttps2udq(zmm_a, zmm_b);
  cc.evex().vcvttps2uqq(xmm_a, xmm_b);
  cc.evex().vcvttps2uqq(ymm_a, xmm_b);
  cc.evex().vcvttps2uqq(zmm_a, ymm_b);
  cc.evex().vcvttsd2si(gpd, xmm_b);
  cc.evex().vcvttsd2si(gpz, xmm_b);
  cc.evex().vcvttsd2usi(gpd, xmm_b);
  cc.evex().vcvttsd2usi(gpz, xmm_b);
  cc.evex().vcvttss2si(gpd, xmm_b);
  cc.evex().vcvttss2si(gpz, xmm_b);
  cc.evex().vcvttss2usi(gpd, xmm_b);
  cc.evex().vcvttss2usi(gpz, xmm_b);
  cc.evex().vcvtudq2pd(xmm_a, xmm_b);
  cc.evex().vcvtudq2pd(ymm_a, xmm_b);
  cc.evex().vcvtudq2pd(zmm_a, ymm_b);
  cc.evex().vcvtudq2ps(xmm_a, xmm_b);
  cc.evex().vcvtudq2ps(ymm_a, ymm_b);
  cc.evex().vcvtudq2ps(zmm_a, zmm_b);
  cc.evex().vcvtuqq2pd(xmm_a, xmm_b);
  cc.evex().vcvtuqq2pd(ymm_a, ymm_b);
  cc.evex().vcvtuqq2pd(zmm_a, zmm_b);
  cc.evex().vcvtuqq2ps(xmm_a, xmm_b);
  cc.evex().vcvtuqq2ps(xmm_a, ymm_b);
  cc.evex().vcvtuqq2ps(ymm_a, zmm_b);
  cc.evex().vcvtusi2sd(xmm_a, xmm_b, gpd);
  cc.evex().vcvtusi2sd(xmm_a, xmm_b, gpz);
  cc.evex().vcvtusi2ss(xmm_a, xmm_b, gpd);
  cc.evex().vcvtusi2ss(xmm_a, xmm_b, gpz);
  cc.evex().vdbpsadbw(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vdbpsadbw(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vdbpsadbw(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vdivpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vdivpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vdivpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vdivps(xmm_a, xmm_b, xmm_c);
  cc.evex().vdivps(ymm_a, ymm_b, ymm_c);
  cc.evex().vdivps(zmm_a, zmm_b, zmm_c);
  cc.evex().vdivsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vdivss(xmm_a, xmm_b, xmm_c);
  cc.evex().vexpandpd(xmm_a, xmm_b);
  cc.evex().vexpandpd(ymm_a, ymm_b);
  cc.evex().vexpandpd(zmm_a, zmm_b);
  cc.evex().vexpandps(xmm_a, xmm_b);
  cc.evex().vexpandps(ymm_a, ymm_b);
  cc.evex().vexpandps(zmm_a, zmm_b);
  cc.evex().vextractf32x4(xmm_a, ymm_b, 0);
  cc.evex().vextractf32x4(xmm_a, zmm_b, 0);
  cc.evex().vextractf32x8(ymm_a, zmm_b, 0);
  cc.evex().vextractf64x2(xmm_a, ymm_b, 0);
  cc.evex().vextractf64x2(xmm_a, zmm_b, 0);
  cc.evex().vextractf64x4(ymm_a, zmm_b, 0);
  cc.evex().vextracti32x4(xmm_a, ymm_b, 0);
  cc.evex().vextracti32x4(xmm_a, zmm_b, 0);
  cc.evex().vextracti32x8(ymm_a, zmm_b, 0);
  cc.evex().vextracti64x2(xmm_a, ymm_b, 0);
  cc.evex().vextracti64x2(xmm_a, zmm_b, 0);
  cc.evex().vextracti64x4(ymm_a, zmm_b, 0);
  cc.evex().vextractps(gpd, xmm_b, 0);
  cc.evex().vfixupimmpd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vfixupimmpd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vfixupimmpd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vfixupimmps(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vfixupimmps(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vfixupimmps(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vfixupimmsd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vfixupimmss(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vfmadd132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd132sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd213sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd231sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub132sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub213sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub231sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub231ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd132sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd213sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd231sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub132sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub213sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub231sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub231ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfpclasspd(kA, xmm_b, 0);
  cc.evex().vfpclasspd(kA, ymm_b, 0);
  cc.evex().vfpclasspd(kA, zmm_b, 0);
  cc.evex().vfpclassps(kA, xmm_b, 0);
  cc.evex().vfpclassps(kA, ymm_b, 0);
  cc.evex().vfpclassps(kA, zmm_b, 0);
  cc.evex().vfpclasssd(kA, xmm_b, 0);
  cc.evex().vfpclassss(kA, xmm_b, 0);
  cc.evex().vgetexppd(xmm_a, xmm_b);
  cc.evex().vgetexppd(ymm_a, ymm_b);
  cc.evex().vgetexppd(zmm_a, zmm_b);
  cc.evex().vgetexpps(xmm_a, xmm_b);
  cc.evex().vgetexpps(ymm_a, ymm_b);
  cc.evex().vgetexpps(zmm_a, zmm_b);
  cc.evex().vgetexpsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vgetexpss(xmm_a, xmm_b, xmm_c);
  cc.evex().vgetmantpd(xmm_a, xmm_b, 0);
  cc.evex().vgetmantpd(ymm_a, ymm_b, 0);
  cc.evex().vgetmantpd(zmm_a, zmm_b, 0);
  cc.evex().vgetmantps(xmm_a, xmm_b, 0);
  cc.evex().vgetmantps(ymm_a, ymm_b, 0);
  cc.evex().vgetmantps(zmm_a, zmm_b, 0);
  cc.evex().vgetmantsd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vgetmantss(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vinsertf32x4(ymm_a, ymm_b, xmm_c, 0);
  cc.evex().vinsertf32x4(zmm_a, zmm_b, xmm_c, 0);
  cc.evex().vinsertf32x8(zmm_a, zmm_b, ymm_c, 0);
  cc.evex().vinsertf64x2(ymm_a, ymm_b, xmm_c, 0);
  cc.evex().vinsertf64x2(zmm_a, zmm_b, xmm_c, 0);
  cc.evex().vinsertf64x4(zmm_a, zmm_b, ymm_c, 0);
  cc.evex().vinserti32x4(ymm_a, ymm_b, xmm_c, 0);
  cc.evex().vinserti32x4(zmm_a, zmm_b, xmm_c, 0);
  cc.evex().vinserti32x8(zmm_a, zmm_b, ymm_c, 0);
  cc.evex().vinserti64x2(ymm_a, ymm_b, xmm_c, 0);
  cc.evex().vinserti64x2(zmm_a, zmm_b, xmm_c, 0);
  cc.evex().vinserti64x4(zmm_a, zmm_b, ymm_c, 0);
  cc.evex().vinsertps(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vmaxpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmaxpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vmaxpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vmaxps(xmm_a, xmm_b, xmm_c);
  cc.evex().vmaxps(ymm_a, ymm_b, ymm_c);
  cc.evex().vmaxps(zmm_a, zmm_b, zmm_c);
  cc.evex().vmaxsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmaxss(xmm_a, xmm_b, xmm_c);
  cc.evex().vminpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vminpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vminpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vminps(xmm_a, xmm_b, xmm_c);
  cc.evex().vminps(ymm_a, ymm_b, ymm_c);
  cc.evex().vminps(zmm_a, zmm_b, zmm_c);
  cc.evex().vminsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vminss(xmm_a, xmm_b, xmm_c);
  cc.evex().vmovapd(xmm_a, xmm_b);
  cc.evex().vmovapd(xmm_a, xmm_b);
  cc.evex().vmovapd(ymm_a, ymm_b);
  cc.evex().vmovapd(ymm_a, ymm_b);
  cc.evex().vmovapd(zmm_a, zmm_b);
  cc.evex().vmovapd(zmm_a, zmm_b);
  cc.evex().vmovaps(xmm_a, xmm_b);
  cc.evex().vmovaps(xmm_a, xmm_b);
  cc.evex().vmovaps(ymm_a, ymm_b);
  cc.evex().vmovaps(ymm_a, ymm_b);
  cc.evex().vmovaps(zmm_a, zmm_b);
  cc.evex().vmovaps(zmm_a, zmm_b);
  cc.evex().vmovd(gpd, xmm_b);
  cc.evex().vmovd(xmm_a, gpd);
  cc.evex().vmovddup(xmm_a, xmm_b);
  cc.evex().vmovddup(ymm_a, ymm_b);
  cc.evex().vmovddup(zmm_a, zmm_b);
  cc.evex().vmovdqa32(xmm_a, xmm_b);
  cc.evex().vmovdqa32(xmm_a, xmm_b);
  cc.evex().vmovdqa32(ymm_a, ymm_b);
  cc.evex().vmovdqa32(ymm_a, ymm_b);
  cc.evex().vmovdqa32(zmm_a, zmm_b);
  cc.evex().vmovdqa32(zmm_a, zmm_b);
  cc.evex().vmovdqa64(xmm_a, xmm_b);
  cc.evex().vmovdqa64(xmm_a, xmm_b);
  cc.evex().vmovdqa64(ymm_a, ymm_b);
  cc.evex().vmovdqa64(ymm_a, ymm_b);
  cc.evex().vmovdqa64(zmm_a, zmm_b);
  cc.evex().vmovdqa64(zmm_a, zmm_b);
  cc.evex().vmovdqu16(xmm_a, xmm_b);
  cc.evex().vmovdqu16(xmm_a, xmm_b);
  cc.evex().vmovdqu16(ymm_a, ymm_b);
  cc.evex().vmovdqu16(ymm_a, ymm_b);
  cc.evex().vmovdqu16(zmm_a, zmm_b);
  cc.evex().vmovdqu16(zmm_a, zmm_b);
  cc.evex().vmovdqu32(xmm_a, xmm_b);
  cc.evex().vmovdqu32(xmm_a, xmm_b);
  cc.evex().vmovdqu32(ymm_a, ymm_b);
  cc.evex().vmovdqu32(ymm_a, ymm_b);
  cc.evex().vmovdqu32(zmm_a, zmm_b);
  cc.evex().vmovdqu32(zmm_a, zmm_b);
  cc.evex().vmovdqu64(xmm_a, xmm_b);
  cc.evex().vmovdqu64(xmm_a, xmm_b);
  cc.evex().vmovdqu64(ymm_a, ymm_b);
  cc.evex().vmovdqu64(ymm_a, ymm_b);
  cc.evex().vmovdqu64(zmm_a, zmm_b);
  cc.evex().vmovdqu64(zmm_a, zmm_b);
  cc.evex().vmovdqu8(xmm_a, xmm_b);
  cc.evex().vmovdqu8(xmm_a, xmm_b);
  cc.evex().vmovdqu8(ymm_a, ymm_b);
  cc.evex().vmovdqu8(ymm_a, ymm_b);
  cc.evex().vmovdqu8(zmm_a, zmm_b);
  cc.evex().vmovdqu8(zmm_a, zmm_b);
  cc.evex().vmovhlps(xmm_a, xmm_b, xmm_c);
  if (cc.is_64bit()) cc.evex().vmovq(gpq, xmm_b);
  if (cc.is_64bit()) cc.evex().vmovq(xmm_a, gpq);
  cc.evex().vmovq(xmm_a, xmm_b);
  cc.evex().vmovsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmovshdup(xmm_a, xmm_b);
  cc.evex().vmovshdup(ymm_a, ymm_b);
  cc.evex().vmovshdup(zmm_a, zmm_b);
  cc.evex().vmovsldup(xmm_a, xmm_b);
  cc.evex().vmovsldup(ymm_a, ymm_b);
  cc.evex().vmovsldup(zmm_a, zmm_b);
  cc.evex().vmovss(xmm_a, xmm_b, xmm_c);
  cc.evex().vmovupd(xmm_a, xmm_b);
  cc.evex().vmovupd(xmm_a, xmm_b);
  cc.evex().vmovupd(ymm_a, ymm_b);
  cc.evex().vmovupd(ymm_a, ymm_b);
  cc.evex().vmovupd(zmm_a, zmm_b);
  cc.evex().vmovupd(zmm_a, zmm_b);
  cc.evex().vmovups(xmm_a, xmm_b);
  cc.evex().vmovups(xmm_a, xmm_b);
  cc.evex().vmovups(ymm_a, ymm_b);
  cc.evex().vmovups(ymm_a, ymm_b);
  cc.evex().vmovups(zmm_a, zmm_b);
  cc.evex().vmovups(zmm_a, zmm_b);
  cc.evex().vmulpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmulpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vmulpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vmulps(xmm_a, xmm_b, xmm_c);
  cc.evex().vmulps(ymm_a, ymm_b, ymm_c);
  cc.evex().vmulps(zmm_a, zmm_b, zmm_c);
  cc.evex().vmulsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmulss(xmm_a, xmm_b, xmm_c);
  cc.evex().vorpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vorpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vorpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vorps(xmm_a, xmm_b, xmm_c);
  cc.evex().vorps(ymm_a, ymm_b, ymm_c);
  cc.evex().vorps(zmm_a, zmm_b, zmm_c);
  cc.evex().vpabsb(xmm_a, xmm_b);
  cc.evex().vpabsb(ymm_a, ymm_b);
  cc.evex().vpabsb(zmm_a, zmm_b);
  cc.evex().vpabsd(xmm_a, xmm_b);
  cc.evex().vpabsd(ymm_a, ymm_b);
  cc.evex().vpabsd(zmm_a, zmm_b);
  cc.evex().vpabsq(xmm_a, xmm_b);
  cc.evex().vpabsq(ymm_a, ymm_b);
  cc.evex().vpabsq(zmm_a, zmm_b);
  cc.evex().vpabsw(xmm_a, xmm_b);
  cc.evex().vpabsw(ymm_a, ymm_b);
  cc.evex().vpabsw(zmm_a, zmm_b);
  cc.evex().vpackssdw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpackssdw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpackssdw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpacksswb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpacksswb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpacksswb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpackusdw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpackusdw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpackusdw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpackuswb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpackuswb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpackuswb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddsb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddsb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddsb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddusb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddusb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddusb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddusw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddusw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddusw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpalignr(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vpalignr(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vpalignr(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vpandd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpandd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpandd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpandnd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpandnd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpandnd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpandnq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpandnq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpandnq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpandq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpandq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpandq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpavgb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpavgb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpavgb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpavgw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpavgw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpavgw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpblendmb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpblendmb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpblendmb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpblendmd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpblendmd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpblendmd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpblendmq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpblendmq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpblendmq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpblendmw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpblendmw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpblendmw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpbroadcastb(xmm_a, gpd);
  cc.evex().vpbroadcastb(xmm_a, xmm_b);
  cc.evex().vpbroadcastb(ymm_a, gpd);
  cc.evex().vpbroadcastb(ymm_a, xmm_b);
  cc.evex().vpbroadcastb(zmm_a, gpd);
  cc.evex().vpbroadcastb(zmm_a, xmm_b);
  cc.evex().vpbroadcastd(xmm_a, gpd);
  cc.evex().vpbroadcastd(xmm_a, xmm_b);
  cc.evex().vpbroadcastd(ymm_a, gpd);
  cc.evex().vpbroadcastd(ymm_a, xmm_b);
  cc.evex().vpbroadcastd(zmm_a, gpd);
  cc.evex().vpbroadcastd(zmm_a, xmm_b);
  cc.evex().vpbroadcastmb2q(xmm_a, kB);
  cc.evex().vpbroadcastmb2q(ymm_a, kB);
  cc.evex().vpbroadcastmb2q(zmm_a, kB);
  cc.evex().vpbroadcastmw2d(xmm_a, kB);
  cc.evex().vpbroadcastmw2d(ymm_a, kB);
  cc.evex().vpbroadcastmw2d(zmm_a, kB);
  if (cc.is_64bit()) cc.evex().vpbroadcastq(xmm_a, gpq);
  cc.evex().vpbroadcastq(xmm_a, xmm_b);
  if (cc.is_64bit()) cc.evex().vpbroadcastq(ymm_a, gpq);
  cc.evex().vpbroadcastq(ymm_a, xmm_b);
  if (cc.is_64bit()) cc.evex().vpbroadcastq(zmm_a, gpq);
  cc.evex().vpbroadcastq(zmm_a, xmm_b);
  cc.evex().vpbroadcastw(xmm_a, gpd);
  cc.evex().vpbroadcastw(xmm_a, xmm_b);
  cc.evex().vpbroadcastw(ymm_a, gpd);
  cc.evex().vpbroadcastw(ymm_a, xmm_b);
  cc.evex().vpbroadcastw(zmm_a, gpd);
  cc.evex().vpbroadcastw(zmm_a, xmm_b);
  cc.evex().vpcmpb(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpb(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpb(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpd(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpd(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpd(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpeqb(kA, xmm_b, xmm_c);
  cc.evex().vpcmpeqb(kA, ymm_b, ymm_c);
  cc.evex().vpcmpeqb(kA, zmm_b, zmm_c);
  cc.evex().vpcmpeqd(kA, xmm_b, xmm_c);
  cc.evex().vpcmpeqd(kA, ymm_b, ymm_c);
  cc.evex().vpcmpeqd(kA, zmm_b, zmm_c);
  cc.evex().vpcmpeqq(kA, xmm_b, xmm_c);
  cc.evex().vpcmpeqq(kA, ymm_b, ymm_c);
  cc.evex().vpcmpeqq(kA, zmm_b, zmm_c);
  cc.evex().vpcmpeqw(kA, xmm_b, xmm_c);
  cc.evex().vpcmpeqw(kA, ymm_b, ymm_c);
  cc.evex().vpcmpeqw(kA, zmm_b, zmm_c);
  cc.evex().vpcmpgtb(kA, xmm_b, xmm_c);
  cc.evex().vpcmpgtb(kA, ymm_b, ymm_c);
  cc.evex().vpcmpgtb(kA, zmm_b, zmm_c);
  cc.evex().vpcmpgtd(kA, xmm_b, xmm_c);
  cc.evex().vpcmpgtd(kA, ymm_b, ymm_c);
  cc.evex().vpcmpgtd(kA, zmm_b, zmm_c);
  cc.evex().vpcmpgtq(kA, xmm_b, xmm_c);
  cc.evex().vpcmpgtq(kA, ymm_b, ymm_c);
  cc.evex().vpcmpgtq(kA, zmm_b, zmm_c);
  cc.evex().vpcmpgtw(kA, xmm_b, xmm_c);
  cc.evex().vpcmpgtw(kA, ymm_b, ymm_c);
  cc.evex().vpcmpgtw(kA, zmm_b, zmm_c);
  cc.evex().vpcmpq(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpq(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpq(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpub(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpub(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpub(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpud(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpud(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpud(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpuq(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpuq(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpuq(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpuw(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpuw(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpuw(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpw(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpw(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpw(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcompressd(xmm_a, xmm_b);
  cc.evex().vpcompressd(ymm_a, ymm_b);
  cc.evex().vpcompressd(zmm_a, zmm_b);
  cc.evex().vpcompressq(xmm_a, xmm_b);
  cc.evex().vpcompressq(ymm_a, ymm_b);
  cc.evex().vpcompressq(zmm_a, zmm_b);
  cc.evex().vpconflictd(xmm_a, xmm_b);
  cc.evex().vpconflictd(ymm_a, ymm_b);
  cc.evex().vpconflictd(zmm_a, zmm_b);
  cc.evex().vpconflictq(xmm_a, xmm_b);
  cc.evex().vpconflictq(ymm_a, ymm_b);
  cc.evex().vpconflictq(zmm_a, zmm_b);
  cc.evex().vpermb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2b(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2b(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2b(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2d(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2d(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2d(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2q(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2q(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2q(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2w(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2w(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2w(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermilpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermilpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermilpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermilpd(xmm_a, xmm_b, 0);
  cc.evex().vpermilpd(ymm_a, ymm_b, 0);
  cc.evex().vpermilpd(zmm_a, zmm_b, 0);
  cc.evex().vpermilps(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermilps(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermilps(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermilps(xmm_a, xmm_b, 0);
  cc.evex().vpermilps(ymm_a, ymm_b, 0);
  cc.evex().vpermilps(zmm_a, zmm_b, 0);
  cc.evex().vpermq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermq(ymm_a, ymm_b, 0);
  cc.evex().vpermq(zmm_a, zmm_b, 0);
  cc.evex().vpermt2b(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2b(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2b(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2d(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2d(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2d(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2q(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2q(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2q(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2w(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2w(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2w(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpexpandd(xmm_a, xmm_b);
  cc.evex().vpexpandd(ymm_a, ymm_b);
  cc.evex().vpexpandd(zmm_a, zmm_b);
  cc.evex().vpexpandq(xmm_a, xmm_b);
  cc.evex().vpexpandq(ymm_a, ymm_b);
  cc.evex().vpexpandq(zmm_a, zmm_b);
  cc.evex().vpextrb(gpd, xmm_b, 0);
  cc.evex().vpextrd(gpd, xmm_b, 0);
  if (cc.is_64bit()) cc.evex().vpextrq(gpq, xmm_b, 0);
  cc.evex().vpextrw(gpd, xmm_b, 0);
  cc.evex().vpinsrb(xmm_a, xmm_b, gpd, 0);
  cc.evex().vpinsrd(xmm_a, xmm_b, gpd, 0);
  if (cc.is_64bit()) cc.evex().vpinsrq(xmm_a, xmm_b, gpq, 0);
  cc.evex().vpinsrw(xmm_a, xmm_b, gpd, 0);
  cc.evex().vplzcntd(xmm_a, xmm_b);
  cc.evex().vplzcntd(ymm_a, ymm_b);
  cc.evex().vplzcntd(zmm_a, zmm_b);
  cc.evex().vplzcntq(xmm_a, xmm_b);
  cc.evex().vplzcntq(ymm_a, ymm_b);
  cc.evex().vplzcntq(zmm_a, zmm_b);
  cc.evex().vpmadd52huq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmadd52huq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmadd52huq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmadd52luq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmadd52luq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmadd52luq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaddubsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaddubsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaddubsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaddwd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaddwd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaddwd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxsb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxsb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxsb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxsd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxsd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxsq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxsq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxsq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxub(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxub(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxub(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxud(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxud(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxud(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxuq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxuq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxuq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxuw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxuw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxuw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminsb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminsb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminsb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminsd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminsd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminsq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminsq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminsq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminub(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminub(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminub(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminud(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminud(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminud(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminuq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminuq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminuq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminuw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminuw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminuw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmovb2m(kA, xmm_b);
  cc.evex().vpmovb2m(kA, ymm_b);
  cc.evex().vpmovb2m(kA, zmm_b);
  cc.evex().vpmovd2m(kA, xmm_b);
  cc.evex().vpmovd2m(kA, ymm_b);
  cc.evex().vpmovd2m(kA, zmm_b);
  cc.evex().vpmovdb(xmm_a, xmm_b);
  cc.evex().vpmovdb(xmm_a, ymm_b);
  cc.evex().vpmovdb(xmm_a, zmm_b);
  cc.evex().vpmovdw(xmm_a, xmm_b);
  cc.evex().vpmovdw(xmm_a, ymm_b);
  cc.evex().vpmovdw(ymm_a, zmm_b);
  cc.evex().vpmovm2b(xmm_a, kB);
  cc.evex().vpmovm2b(ymm_a, kB);
  cc.evex().vpmovm2b(zmm_a, kB);
  cc.evex().vpmovm2d(xmm_a, kB);
  cc.evex().vpmovm2d(ymm_a, kB);
  cc.evex().vpmovm2d(zmm_a, kB);
  cc.evex().vpmovm2q(xmm_a, kB);
  cc.evex().vpmovm2q(ymm_a, kB);
  cc.evex().vpmovm2q(zmm_a, kB);
  cc.evex().vpmovm2w(xmm_a, kB);
  cc.evex().vpmovm2w(ymm_a, kB);
  cc.evex().vpmovm2w(zmm_a, kB);
  cc.evex().vpmovq2m(kA, xmm_b);
  cc.evex().vpmovq2m(kA, ymm_b);
  cc.evex().vpmovq2m(kA, zmm_b);
  cc.evex().vpmovqb(xmm_a, xmm_b);
  cc.evex().vpmovqb(xmm_a, ymm_b);
  cc.evex().vpmovqb(xmm_a, zmm_b);
  cc.evex().vpmovqd(xmm_a, xmm_b);
  cc.evex().vpmovqd(xmm_a, ymm_b);
  cc.evex().vpmovqd(ymm_a, zmm_b);
  cc.evex().vpmovqw(xmm_a, xmm_b);
  cc.evex().vpmovqw(xmm_a, ymm_b);
  cc.evex().vpmovqw(xmm_a, zmm_b);
  cc.evex().vpmovsdb(xmm_a, xmm_b);
  cc.evex().vpmovsdb(xmm_a, ymm_b);
  cc.evex().vpmovsdb(xmm_a, zmm_b);
  cc.evex().vpmovsdw(xmm_a, xmm_b);
  cc.evex().vpmovsdw(xmm_a, ymm_b);
  cc.evex().vpmovsdw(ymm_a, zmm_b);
  cc.evex().vpmovsqb(xmm_a, xmm_b);
  cc.evex().vpmovsqb(xmm_a, ymm_b);
  cc.evex().vpmovsqb(xmm_a, zmm_b);
  cc.evex().vpmovsqd(xmm_a, xmm_b);
  cc.evex().vpmovsqd(xmm_a, ymm_b);
  cc.evex().vpmovsqd(ymm_a, zmm_b);
  cc.evex().vpmovsqw(xmm_a, xmm_b);
  cc.evex().vpmovsqw(xmm_a, ymm_b);
  cc.evex().vpmovsqw(xmm_a, zmm_b);
  cc.evex().vpmovswb(xmm_a, xmm_b);
  cc.evex().vpmovswb(xmm_a, ymm_b);
  cc.evex().vpmovswb(ymm_a, zmm_b);
  cc.evex().vpmovsxbd(xmm_a, xmm_b);
  cc.evex().vpmovsxbd(ymm_a, xmm_b);
  cc.evex().vpmovsxbd(zmm_a, xmm_b);
  cc.evex().vpmovsxbq(xmm_a, xmm_b);
  cc.evex().vpmovsxbq(ymm_a, xmm_b);
  cc.evex().vpmovsxbq(zmm_a, xmm_b);
  cc.evex().vpmovsxbw(xmm_a, xmm_b);
  cc.evex().vpmovsxbw(ymm_a, xmm_b);
  cc.evex().vpmovsxbw(zmm_a, ymm_b);
  cc.evex().vpmovsxdq(xmm_a, xmm_b);
  cc.evex().vpmovsxdq(ymm_a, xmm_b);
  cc.evex().vpmovsxdq(zmm_a, ymm_b);
  cc.evex().vpmovsxwd(xmm_a, xmm_b);
  cc.evex().vpmovsxwd(ymm_a, xmm_b);
  cc.evex().vpmovsxwd(zmm_a, ymm_b);
  cc.evex().vpmovsxwq(xmm_a, xmm_b);
  cc.evex().vpmovsxwq(ymm_a, xmm_b);
  cc.evex().vpmovsxwq(zmm_a, xmm_b);
  cc.evex().vpmovusdb(xmm_a, xmm_b);
  cc.evex().vpmovusdb(xmm_a, ymm_b);
  cc.evex().vpmovusdb(xmm_a, zmm_b);
  cc.evex().vpmovusdw(xmm_a, xmm_b);
  cc.evex().vpmovusdw(xmm_a, ymm_b);
  cc.evex().vpmovusdw(ymm_a, zmm_b);
  cc.evex().vpmovusqb(xmm_a, xmm_b);
  cc.evex().vpmovusqb(xmm_a, ymm_b);
  cc.evex().vpmovusqb(xmm_a, zmm_b);
  cc.evex().vpmovusqd(xmm_a, xmm_b);
  cc.evex().vpmovusqd(xmm_a, ymm_b);
  cc.evex().vpmovusqd(ymm_a, zmm_b);
  cc.evex().vpmovusqw(xmm_a, xmm_b);
  cc.evex().vpmovusqw(xmm_a, ymm_b);
  cc.evex().vpmovusqw(xmm_a, zmm_b);
  cc.evex().vpmovuswb(xmm_a, xmm_b);
  cc.evex().vpmovuswb(xmm_a, ymm_b);
  cc.evex().vpmovuswb(ymm_a, zmm_b);
  cc.evex().vpmovw2m(kA, xmm_b);
  cc.evex().vpmovw2m(kA, ymm_b);
  cc.evex().vpmovw2m(kA, zmm_b);
  cc.evex().vpmovwb(xmm_a, xmm_b);
  cc.evex().vpmovwb(xmm_a, ymm_b);
  cc.evex().vpmovwb(ymm_a, zmm_b);
  cc.evex().vpmovzxbd(xmm_a, xmm_b);
  cc.evex().vpmovzxbd(ymm_a, xmm_b);
  cc.evex().vpmovzxbd(zmm_a, xmm_b);
  cc.evex().vpmovzxbq(xmm_a, xmm_b);
  cc.evex().vpmovzxbq(ymm_a, xmm_b);
  cc.evex().vpmovzxbq(zmm_a, xmm_b);
  cc.evex().vpmovzxbw(xmm_a, xmm_b);
  cc.evex().vpmovzxbw(ymm_a, xmm_b);
  cc.evex().vpmovzxbw(zmm_a, ymm_b);
  cc.evex().vpmovzxdq(xmm_a, xmm_b);
  cc.evex().vpmovzxdq(ymm_a, xmm_b);
  cc.evex().vpmovzxdq(zmm_a, ymm_b);
  cc.evex().vpmovzxwd(xmm_a, xmm_b);
  cc.evex().vpmovzxwd(ymm_a, xmm_b);
  cc.evex().vpmovzxwd(zmm_a, ymm_b);
  cc.evex().vpmovzxwq(xmm_a, xmm_b);
  cc.evex().vpmovzxwq(ymm_a, xmm_b);
  cc.evex().vpmovzxwq(zmm_a, xmm_b);
  cc.evex().vpmuldq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmuldq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmuldq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmulhrsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmulhrsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmulhrsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmulhuw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmulhuw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmulhuw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmulhw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmulhw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmulhw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmulld(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmulld(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmulld(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmullq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmullq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmullq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmullw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmullw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmullw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmultishiftqb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmultishiftqb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmultishiftqb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmuludq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmuludq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmuludq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpopcntd(zmm_a, zmm_b);
  cc.evex().vpopcntq(zmm_a, zmm_b);
  cc.evex().vpord(xmm_a, xmm_b, xmm_c);
  cc.evex().vpord(ymm_a, ymm_b, ymm_c);
  cc.evex().vpord(zmm_a, zmm_b, zmm_c);
  cc.evex().vporq(xmm_a, xmm_b, xmm_c);
  cc.evex().vporq(ymm_a, ymm_b, ymm_c);
  cc.evex().vporq(zmm_a, zmm_b, zmm_c);
  cc.evex().vprold(xmm_a, xmm_b, 0);
  cc.evex().vprold(ymm_a, ymm_b, 0);
  cc.evex().vprold(zmm_a, zmm_b, 0);
  cc.evex().vprolq(xmm_a, xmm_b, 0);
  cc.evex().vprolq(ymm_a, ymm_b, 0);
  cc.evex().vprolq(zmm_a, zmm_b, 0);
  cc.evex().vprolvd(xmm_a, xmm_b, xmm_c);
  cc.evex().vprolvd(ymm_a, ymm_b, ymm_c);
  cc.evex().vprolvd(zmm_a, zmm_b, zmm_c);
  cc.evex().vprolvq(xmm_a, xmm_b, xmm_c);
  cc.evex().vprolvq(ymm_a, ymm_b, ymm_c);
  cc.evex().vprolvq(zmm_a, zmm_b, zmm_c);
  cc.evex().vprord(xmm_a, xmm_b, 0);
  cc.evex().vprord(ymm_a, ymm_b, 0);
  cc.evex().vprord(zmm_a, zmm_b, 0);
  cc.evex().vprorq(xmm_a, xmm_b, 0);
  cc.evex().vprorq(ymm_a, ymm_b, 0);
  cc.evex().vprorq(zmm_a, zmm_b, 0);
  cc.evex().vprorvd(xmm_a, xmm_b, xmm_c);
  cc.evex().vprorvd(ymm_a, ymm_b, ymm_c);
  cc.evex().vprorvd(zmm_a, zmm_b, zmm_c);
  cc.evex().vprorvq(xmm_a, xmm_b, xmm_c);
  cc.evex().vprorvq(ymm_a, ymm_b, ymm_c);
  cc.evex().vprorvq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsadbw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsadbw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsadbw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpshufb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpshufb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpshufb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpshufd(xmm_a, xmm_b, 0);
  cc.evex().vpshufd(ymm_a, ymm_b, 0);
  cc.evex().vpshufd(zmm_a, zmm_b, 0);
  cc.evex().vpshufhw(xmm_a, xmm_b, 0);
  cc.evex().vpshufhw(ymm_a, ymm_b, 0);
  cc.evex().vpshufhw(zmm_a, zmm_b, 0);
  cc.evex().vpshuflw(xmm_a, xmm_b, 0);
  cc.evex().vpshuflw(ymm_a, ymm_b, 0);
  cc.evex().vpshuflw(zmm_a, zmm_b, 0);
  cc.evex().vpslld(xmm_a, xmm_b, xmm_c);
  cc.evex().vpslld(xmm_a, xmm_b, 0);
  cc.evex().vpslld(ymm_a, ymm_b, xmm_c);
  cc.evex().vpslld(ymm_a, ymm_b, 0);
  cc.evex().vpslld(zmm_a, zmm_b, xmm_c);
  cc.evex().vpslld(zmm_a, zmm_b, 0);
  cc.evex().vpslldq(xmm_a, xmm_b, 0);
  cc.evex().vpslldq(ymm_a, ymm_b, 0);
  cc.evex().vpslldq(zmm_a, zmm_b, 0);
  cc.evex().vpsllq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllq(xmm_a, xmm_b, 0);
  cc.evex().vpsllq(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsllq(ymm_a, ymm_b, 0);
  cc.evex().vpsllq(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsllq(zmm_a, zmm_b, 0);
  cc.evex().vpsllvd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllvd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsllvd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsllvq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllvq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsllvq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsllvw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllvw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsllvw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsllw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllw(xmm_a, xmm_b, 0);
  cc.evex().vpsllw(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsllw(ymm_a, ymm_b, 0);
  cc.evex().vpsllw(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsllw(zmm_a, zmm_b, 0);
  cc.evex().vpsrad(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrad(xmm_a, xmm_b, 0);
  cc.evex().vpsrad(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsrad(ymm_a, ymm_b, 0);
  cc.evex().vpsrad(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsrad(zmm_a, zmm_b, 0);
  cc.evex().vpsraq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsraq(xmm_a, xmm_b, 0);
  cc.evex().vpsraq(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsraq(ymm_a, ymm_b, 0);
  cc.evex().vpsraq(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsraq(zmm_a, zmm_b, 0);
  cc.evex().vpsravd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsravd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsravd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsravq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsravq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsravq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsravw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsravw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsravw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsraw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsraw(xmm_a, xmm_b, 0);
  cc.evex().vpsraw(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsraw(ymm_a, ymm_b, 0);
  cc.evex().vpsraw(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsraw(zmm_a, zmm_b, 0);
  cc.evex().vpsrld(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrld(xmm_a, xmm_b, 0);
  cc.evex().vpsrld(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsrld(ymm_a, ymm_b, 0);
  cc.evex().vpsrld(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsrld(zmm_a, zmm_b, 0);
  cc.evex().vpsrldq(xmm_a, xmm_b, 0);
  cc.evex().vpsrldq(ymm_a, ymm_b, 0);
  cc.evex().vpsrldq(zmm_a, zmm_b, 0);
  cc.evex().vpsrlq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlq(xmm_a, xmm_b, 0);
  cc.evex().vpsrlq(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsrlq(ymm_a, ymm_b, 0);
  cc.evex().vpsrlq(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsrlq(zmm_a, zmm_b, 0);
  cc.evex().vpsrlvd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlvd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsrlvd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsrlvq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlvq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsrlvq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsrlvw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlvw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsrlvw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsrlw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlw(xmm_a, xmm_b, 0);
  cc.evex().vpsrlw(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsrlw(ymm_a, ymm_b, 0);
  cc.evex().vpsrlw(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsrlw(zmm_a, zmm_b, 0);
  cc.evex().vpsubb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubsb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubsb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubsb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubusb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubusb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubusb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubusw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubusw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubusw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpternlogd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vpternlogd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vpternlogd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vpternlogq(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vpternlogq(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vpternlogq(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vptestmb(kA, xmm_b, xmm_c);
  cc.evex().vptestmb(kA, ymm_b, ymm_c);
  cc.evex().vptestmb(kA, zmm_b, zmm_c);
  cc.evex().vptestmd(kA, xmm_b, xmm_c);
  cc.evex().vptestmd(kA, ymm_b, ymm_c);
  cc.evex().vptestmd(kA, zmm_b, zmm_c);
  cc.evex().vptestmq(kA, xmm_b, xmm_c);
  cc.evex().vptestmq(kA, ymm_b, ymm_c);
  cc.evex().vptestmq(kA, zmm_b, zmm_c);
  cc.evex().vptestmw(kA, xmm_b, xmm_c);
  cc.evex().vptestmw(kA, ymm_b, ymm_c);
  cc.evex().vptestmw(kA, zmm_b, zmm_c);
  cc.evex().vptestnmb(kA, xmm_b, xmm_c);
  cc.evex().vptestnmb(kA, ymm_b, ymm_c);
  cc.evex().vptestnmb(kA, zmm_b, zmm_c);
  cc.evex().vptestnmd(kA, xmm_b, xmm_c);
  cc.evex().vptestnmd(kA, ymm_b, ymm_c);
  cc.evex().vptestnmd(kA, zmm_b, zmm_c);
  cc.evex().vptestnmq(kA, xmm_b, xmm_c);
  cc.evex().vptestnmq(kA, ymm_b, ymm_c);
  cc.evex().vptestnmq(kA, zmm_b, zmm_c);
  cc.evex().vptestnmw(kA, xmm_b, xmm_c);
  cc.evex().vptestnmw(kA, ymm_b, ymm_c);
  cc.evex().vptestnmw(kA, zmm_b, zmm_c);
  cc.evex().vpunpckhbw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckhbw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckhbw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpckhdq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckhdq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckhdq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpckhqdq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckhqdq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckhqdq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpckhwd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckhwd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckhwd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpcklbw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpcklbw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpcklbw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpckldq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckldq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckldq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpcklqdq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpcklqdq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpcklqdq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpcklwd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpcklwd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpcklwd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpxord(xmm_a, xmm_b, xmm_c);
  cc.evex().vpxord(ymm_a, ymm_b, ymm_c);
  cc.evex().vpxord(zmm_a, zmm_b, zmm_c);
  cc.evex().vpxorq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpxorq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpxorq(zmm_a, zmm_b, zmm_c);
  cc.evex().vrangepd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrangepd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vrangepd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vrangeps(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrangeps(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vrangeps(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vrangesd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrangess(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrcp14pd(xmm_a, xmm_b);
  cc.evex().vrcp14pd(ymm_a, ymm_b);
  cc.evex().vrcp14pd(zmm_a, zmm_b);
  cc.evex().vrcp14ps(xmm_a, xmm_b);
  cc.evex().vrcp14ps(ymm_a, ymm_b);
  cc.evex().vrcp14ps(zmm_a, zmm_b);
  cc.evex().vrcp14sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vrcp14ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vreducepd(xmm_a, xmm_b, 0);
  cc.evex().vreducepd(ymm_a, ymm_b, 0);
  cc.evex().vreducepd(zmm_a, zmm_b, 0);
  cc.evex().vreduceps(xmm_a, xmm_b, 0);
  cc.evex().vreduceps(ymm_a, ymm_b, 0);
  cc.evex().vreduceps(zmm_a, zmm_b, 0);
  cc.evex().vreducesd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vreducess(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrndscalepd(xmm_a, xmm_b, 0);
  cc.evex().vrndscalepd(ymm_a, ymm_b, 0);
  cc.evex().vrndscalepd(zmm_a, zmm_b, 0);
  cc.evex().vrndscaleps(xmm_a, xmm_b, 0);
  cc.evex().vrndscaleps(ymm_a, ymm_b, 0);
  cc.evex().vrndscaleps(zmm_a, zmm_b, 0);
  cc.evex().vrndscalesd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrndscaless(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrsqrt14pd(xmm_a, xmm_b);
  cc.evex().vrsqrt14pd(ymm_a, ymm_b);
  cc.evex().vrsqrt14pd(zmm_a, zmm_b);
  cc.evex().vrsqrt14ps(xmm_a, xmm_b);
  cc.evex().vrsqrt14ps(ymm_a, ymm_b);
  cc.evex().vrsqrt14ps(zmm_a, zmm_b);
  cc.evex().vrsqrt14sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vrsqrt14ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vscalefpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vscalefpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vscalefpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vscalefps(xmm_a, xmm_b, xmm_c);
  cc.evex().vscalefps(ymm_a, ymm_b, ymm_c);
  cc.evex().vscalefps(zmm_a, zmm_b, zmm_c);
  cc.evex().vscalefsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vscalefss(xmm_a, xmm_b, xmm_c);
  cc.evex().vshuff32x4(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshuff32x4(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshuff64x2(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshuff64x2(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshufi32x4(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshufi32x4(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshufi64x2(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshufi64x2(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshufpd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vshufpd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshufpd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshufps(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vshufps(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshufps(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vsqrtpd(xmm_a, xmm_b);
  cc.evex().vsqrtpd(ymm_a, ymm_b);
  cc.evex().vsqrtpd(zmm_a, zmm_b);
  cc.evex().vsqrtps(xmm_a, xmm_b);
  cc.evex().vsqrtps(ymm_a, ymm_b);
  cc.evex().vsqrtps(zmm_a, zmm_b);
  cc.evex().vsqrtsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vsqrtss(xmm_a, xmm_b, xmm_c);
  cc.evex().vsubpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vsubpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vsubpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vsubps(xmm_a, xmm_b, xmm_c);
  cc.evex().vsubps(ymm_a, ymm_b, ymm_c);
  cc.evex().vsubps(zmm_a, zmm_b, zmm_c);
  cc.evex().vsubsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vsubss(xmm_a, xmm_b, xmm_c);
  cc.evex().vucomisd(xmm_a, xmm_b);
  cc.evex().vucomiss(xmm_a, xmm_b);
  cc.evex().vunpckhpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vunpckhpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vunpckhpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpckhps(xmm_a, xmm_b, xmm_c);
  cc.evex().vunpckhps(ymm_a, ymm_b, ymm_c);
  cc.evex().vunpckhps(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpcklpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vunpcklpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vunpcklpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpcklps(xmm_a, xmm_b, xmm_c);
  cc.evex().vunpcklps(ymm_a, ymm_b, ymm_c);
  cc.evex().vunpcklps(zmm_a, zmm_b, zmm_c);
  cc.evex().vxorpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vxorpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vxorpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vxorps(xmm_a, xmm_b, xmm_c);
  cc.evex().vxorps(ymm_a, ymm_b, ymm_c);
  cc.evex().vxorps(zmm_a, zmm_b, zmm_c);
}

template<typename Emitter>
static void generate_avx512_sequence_internal_reg_mem(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  Support::maybe_unused(kC);

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_d = vec_d.ymm();

  x86::Vec zmm_a = vec_a.zmm();
  x86::Vec zmm_b = vec_b.zmm();
  x86::Vec zmm_d = vec_d.zmm();

  x86::Mem m = x86::ptr(gpz);
  x86::Mem m32 = x86::dword_ptr(gpz);
  x86::Mem m64 = x86::qword_ptr(gpz);
  x86::Mem m128 = x86::xmmword_ptr(gpz);
  x86::Mem m256 = x86::ymmword_ptr(gpz);
  x86::Mem m512 = x86::zmmword_ptr(gpz);
  x86::Mem vx_ptr = x86::ptr(gpz, xmm_d);
  x86::Mem vy_ptr = x86::ptr(gpz, ymm_d);
  x86::Mem vz_ptr = x86::ptr(gpz, zmm_d);

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

  cc.kmovb(kA, m);
  cc.kmovb(m, kB);
  cc.kmovd(kA, m);
  cc.kmovd(m, kB);
  cc.kmovq(kA, m);
  cc.kmovq(m, kB);
  cc.kmovw(kA, m);
  cc.kmovw(m, kB);

  cc.evex().vaddpd(xmm_a, xmm_b, m);
  cc.evex().vaddpd(ymm_a, ymm_b, m);
  cc.evex().vaddpd(zmm_a, zmm_b, m);
  cc.evex().vaddps(xmm_a, xmm_b, m);
  cc.evex().vaddps(ymm_a, ymm_b, m);
  cc.evex().vaddps(zmm_a, zmm_b, m);
  cc.evex().vaddsd(xmm_a, xmm_b, m);
  cc.evex().vaddss(xmm_a, xmm_b, m);
  cc.evex().valignd(xmm_a, xmm_b, m, 0);
  cc.evex().valignd(ymm_a, ymm_b, m, 0);
  cc.evex().valignd(zmm_a, zmm_b, m, 0);
  cc.evex().valignq(xmm_a, xmm_b, m, 0);
  cc.evex().valignq(ymm_a, ymm_b, m, 0);
  cc.evex().valignq(zmm_a, zmm_b, m, 0);
  cc.evex().vandnpd(xmm_a, xmm_b, m);
  cc.evex().vandnpd(ymm_a, ymm_b, m);
  cc.evex().vandnpd(zmm_a, zmm_b, m);
  cc.evex().vandnps(xmm_a, xmm_b, m);
  cc.evex().vandnps(ymm_a, ymm_b, m);
  cc.evex().vandnps(zmm_a, zmm_b, m);
  cc.evex().vandpd(xmm_a, xmm_b, m);
  cc.evex().vandpd(ymm_a, ymm_b, m);
  cc.evex().vandpd(zmm_a, zmm_b, m);
  cc.evex().vandps(xmm_a, xmm_b, m);
  cc.evex().vandps(ymm_a, ymm_b, m);
  cc.evex().vandps(zmm_a, zmm_b, m);
  cc.evex().vblendmpd(xmm_a, xmm_b, m);
  cc.evex().vblendmpd(ymm_a, ymm_b, m);
  cc.evex().vblendmpd(zmm_a, zmm_b, m);
  cc.evex().vblendmps(xmm_a, xmm_b, m);
  cc.evex().vblendmps(ymm_a, ymm_b, m);
  cc.evex().vblendmps(zmm_a, zmm_b, m);
  cc.evex().vbroadcastf32x2(ymm_a, m);
  cc.evex().vbroadcastf32x2(zmm_a, m);
  cc.evex().vbroadcastf32x4(ymm_a, m);
  cc.evex().vbroadcastf32x4(zmm_a, m);
  cc.evex().vbroadcastf32x8(zmm_a, m);
  cc.evex().vbroadcastf64x2(ymm_a, m);
  cc.evex().vbroadcastf64x2(zmm_a, m);
  cc.evex().vbroadcastf64x4(zmm_a, m);
  cc.evex().vbroadcasti32x2(xmm_a, m);
  cc.evex().vbroadcasti32x2(ymm_a, m);
  cc.evex().vbroadcasti32x2(zmm_a, m);
  cc.evex().vbroadcasti32x4(ymm_a, m);
  cc.evex().vbroadcasti32x4(zmm_a, m);
  cc.evex().vbroadcasti32x8(zmm_a, m);
  cc.evex().vbroadcasti64x2(ymm_a, m);
  cc.evex().vbroadcasti64x2(zmm_a, m);
  cc.evex().vbroadcasti64x4(zmm_a, m);
  cc.evex().vbroadcastsd(ymm_a, m);
  cc.evex().vbroadcastsd(zmm_a, m);
  cc.evex().vbroadcastss(xmm_a, m);
  cc.evex().vbroadcastss(ymm_a, m);
  cc.evex().vbroadcastss(zmm_a, m);
  cc.evex().vcmppd(kA, xmm_b, m, 0);
  cc.evex().vcmppd(kA, ymm_b, m, 0);
  cc.evex().vcmppd(kA, zmm_b, m, 0);
  cc.evex().vcmpps(kA, xmm_b, m, 0);
  cc.evex().vcmpps(kA, ymm_b, m, 0);
  cc.evex().vcmpps(kA, zmm_b, m, 0);
  cc.evex().vcmpsd(kA, xmm_b, m, 0);
  cc.evex().vcmpss(kA, xmm_b, m, 0);
  cc.evex().vcomisd(xmm_a, m);
  cc.evex().vcomiss(xmm_a, m);
  cc.evex().vcompresspd(m, xmm_b);
  cc.evex().vcompresspd(m, ymm_b);
  cc.evex().vcompresspd(m, zmm_b);
  cc.evex().vcompressps(m, xmm_b);
  cc.evex().vcompressps(m, ymm_b);
  cc.evex().vcompressps(m, zmm_b);
  cc.evex().vcvtdq2pd(xmm_a, m);
  cc.evex().vcvtdq2pd(ymm_a, m);
  cc.evex().vcvtdq2pd(zmm_a, m);
  cc.evex().vcvtdq2ps(xmm_a, m);
  cc.evex().vcvtdq2ps(ymm_a, m);
  cc.evex().vcvtdq2ps(zmm_a, m);
  cc.evex().vcvtpd2dq(xmm_a, m128);
  cc.evex().vcvtpd2dq(xmm_a, m256);
  cc.evex().vcvtpd2dq(ymm_a, m512);
  cc.evex().vcvtpd2qq(xmm_a, m);
  cc.evex().vcvtpd2qq(ymm_a, m);
  cc.evex().vcvtpd2qq(zmm_a, m);
  cc.evex().vcvtpd2udq(xmm_a, m128);
  cc.evex().vcvtpd2udq(xmm_a, m256);
  cc.evex().vcvtpd2udq(ymm_a, m512);
  cc.evex().vcvtpd2uqq(xmm_a, m);
  cc.evex().vcvtpd2uqq(ymm_a, m);
  cc.evex().vcvtpd2uqq(zmm_a, m);
  cc.evex().vcvtph2ps(xmm_a, m);
  cc.evex().vcvtph2ps(ymm_a, m);
  cc.evex().vcvtph2ps(zmm_a, m);
  cc.evex().vcvtps2dq(xmm_a, m);
  cc.evex().vcvtps2dq(ymm_a, m);
  cc.evex().vcvtps2dq(zmm_a, m);
  cc.evex().vcvtps2pd(xmm_a, m);
  cc.evex().vcvtps2pd(ymm_a, m);
  cc.evex().vcvtps2pd(zmm_a, m);
  cc.evex().vcvtps2ph(m, xmm_b, 0);
  cc.evex().vcvtps2ph(m, ymm_b, 0);
  cc.evex().vcvtps2ph(m, zmm_b, 0);
  cc.evex().vcvtps2qq(xmm_a, m);
  cc.evex().vcvtps2qq(ymm_a, m);
  cc.evex().vcvtps2qq(zmm_a, m);
  cc.evex().vcvtps2udq(xmm_a, m);
  cc.evex().vcvtps2udq(ymm_a, m);
  cc.evex().vcvtps2udq(zmm_a, m);
  cc.evex().vcvtps2uqq(xmm_a, m);
  cc.evex().vcvtps2uqq(ymm_a, m);
  cc.evex().vcvtps2uqq(zmm_a, m);
  cc.evex().vcvtqq2pd(xmm_a, m);
  cc.evex().vcvtqq2pd(ymm_a, m);
  cc.evex().vcvtqq2pd(zmm_a, m);
  cc.evex().vcvtqq2ps(xmm_a, m128);
  cc.evex().vcvtqq2ps(xmm_a, m256);
  cc.evex().vcvtqq2ps(ymm_a, m512);
  cc.evex().vcvtsd2si(gpd, m);
  cc.evex().vcvtsd2si(gpz, m);
  cc.evex().vcvtsd2ss(xmm_a, xmm_b, m);
  cc.evex().vcvtsd2usi(gpd, m);
  cc.evex().vcvtsd2usi(gpz, m);
  cc.evex().vcvtsi2sd(xmm_a, xmm_b, m32);
  if (cc.is_64bit()) cc.evex().vcvtsi2sd(xmm_a, xmm_b, m64);
  cc.evex().vcvtsi2ss(xmm_a, xmm_b, m32);
  if (cc.is_64bit()) cc.evex().vcvtsi2ss(xmm_a, xmm_b, m64);
  cc.evex().vcvtss2sd(xmm_a, xmm_b, m);
  cc.evex().vcvtss2si(gpd, m);
  cc.evex().vcvtss2si(gpz, m);
  cc.evex().vcvtss2usi(gpd, m);
  cc.evex().vcvtss2usi(gpz, m);
  cc.evex().vcvttpd2dq(xmm_a, m128);
  cc.evex().vcvttpd2dq(xmm_a, m256);
  cc.evex().vcvttpd2dq(ymm_a, m512);
  cc.evex().vcvttpd2qq(xmm_a, m);
  cc.evex().vcvttpd2qq(ymm_a, m);
  cc.evex().vcvttpd2qq(zmm_a, m);
  cc.evex().vcvttpd2udq(xmm_a, m128);
  cc.evex().vcvttpd2udq(xmm_a, m256);
  cc.evex().vcvttpd2udq(ymm_a, m512);
  cc.evex().vcvttpd2uqq(xmm_a, m);
  cc.evex().vcvttpd2uqq(ymm_a, m);
  cc.evex().vcvttpd2uqq(zmm_a, m);
  cc.evex().vcvttps2dq(xmm_a, m);
  cc.evex().vcvttps2dq(ymm_a, m);
  cc.evex().vcvttps2dq(zmm_a, m);
  cc.evex().vcvttps2qq(xmm_a, m);
  cc.evex().vcvttps2qq(ymm_a, m);
  cc.evex().vcvttps2qq(zmm_a, m);
  cc.evex().vcvttps2udq(xmm_a, m);
  cc.evex().vcvttps2udq(ymm_a, m);
  cc.evex().vcvttps2udq(zmm_a, m);
  cc.evex().vcvttps2uqq(xmm_a, m);
  cc.evex().vcvttps2uqq(ymm_a, m);
  cc.evex().vcvttps2uqq(zmm_a, m);
  cc.evex().vcvttsd2si(gpd, m);
  cc.evex().vcvttsd2si(gpz, m);
  cc.evex().vcvttsd2usi(gpd, m);
  cc.evex().vcvttsd2usi(gpz, m);
  cc.evex().vcvttss2si(gpd, m);
  cc.evex().vcvttss2si(gpz, m);
  cc.evex().vcvttss2usi(gpd, m);
  cc.evex().vcvttss2usi(gpz, m);
  cc.evex().vcvtudq2pd(xmm_a, m);
  cc.evex().vcvtudq2pd(ymm_a, m);
  cc.evex().vcvtudq2pd(zmm_a, m);
  cc.evex().vcvtudq2ps(xmm_a, m);
  cc.evex().vcvtudq2ps(ymm_a, m);
  cc.evex().vcvtudq2ps(zmm_a, m);
  cc.evex().vcvtuqq2pd(xmm_a, m);
  cc.evex().vcvtuqq2pd(ymm_a, m);
  cc.evex().vcvtuqq2pd(zmm_a, m);
  cc.evex().vcvtuqq2ps(xmm_a, m128);
  cc.evex().vcvtuqq2ps(xmm_a, m256);
  cc.evex().vcvtuqq2ps(ymm_a, m512);
  cc.evex().vcvtusi2sd(xmm_a, xmm_b, m32);
  if (cc.is_64bit()) cc.evex().vcvtusi2sd(xmm_a, xmm_b, m64);
  cc.evex().vcvtusi2ss(xmm_a, xmm_b, m32);
  if (cc.is_64bit()) cc.evex().vcvtusi2ss(xmm_a, xmm_b, m64);
  cc.evex().vdbpsadbw(xmm_a, xmm_b, m, 0);
  cc.evex().vdbpsadbw(ymm_a, ymm_b, m, 0);
  cc.evex().vdbpsadbw(zmm_a, zmm_b, m, 0);
  cc.evex().vdivpd(xmm_a, xmm_b, m);
  cc.evex().vdivpd(ymm_a, ymm_b, m);
  cc.evex().vdivpd(zmm_a, zmm_b, m);
  cc.evex().vdivps(xmm_a, xmm_b, m);
  cc.evex().vdivps(ymm_a, ymm_b, m);
  cc.evex().vdivps(zmm_a, zmm_b, m);
  cc.evex().vdivsd(xmm_a, xmm_b, m);
  cc.evex().vdivss(xmm_a, xmm_b, m);
  cc.evex().vexpandpd(xmm_a, m);
  cc.evex().vexpandpd(ymm_a, m);
  cc.evex().vexpandpd(zmm_a, m);
  cc.evex().vexpandps(xmm_a, m);
  cc.evex().vexpandps(ymm_a, m);
  cc.evex().vexpandps(zmm_a, m);
  cc.evex().vextractf32x4(m, ymm_b, 0);
  cc.evex().vextractf32x4(m, zmm_b, 0);
  cc.evex().vextractf32x8(m, zmm_b, 0);
  cc.evex().vextractf64x2(m, ymm_b, 0);
  cc.evex().vextractf64x2(m, zmm_b, 0);
  cc.evex().vextractf64x4(m, zmm_b, 0);
  cc.evex().vextracti32x4(m, ymm_b, 0);
  cc.evex().vextracti32x4(m, zmm_b, 0);
  cc.evex().vextracti32x8(m, zmm_b, 0);
  cc.evex().vextracti64x2(m, ymm_b, 0);
  cc.evex().vextracti64x2(m, zmm_b, 0);
  cc.evex().vextracti64x4(m, zmm_b, 0);
  cc.evex().vextractps(m, xmm_b, 0);
  cc.evex().vfixupimmpd(xmm_a, xmm_b, m, 0);
  cc.evex().vfixupimmpd(ymm_a, ymm_b, m, 0);
  cc.evex().vfixupimmpd(zmm_a, zmm_b, m, 0);
  cc.evex().vfixupimmps(xmm_a, xmm_b, m, 0);
  cc.evex().vfixupimmps(ymm_a, ymm_b, m, 0);
  cc.evex().vfixupimmps(zmm_a, zmm_b, m, 0);
  cc.evex().vfixupimmsd(xmm_a, xmm_b, m, 0);
  cc.evex().vfixupimmss(xmm_a, xmm_b, m, 0);
  cc.evex().vfmadd132pd(xmm_a, xmm_b, m);
  cc.evex().vfmadd132pd(ymm_a, ymm_b, m);
  cc.evex().vfmadd132pd(zmm_a, zmm_b, m);
  cc.evex().vfmadd132ps(xmm_a, xmm_b, m);
  cc.evex().vfmadd132ps(ymm_a, ymm_b, m);
  cc.evex().vfmadd132ps(zmm_a, zmm_b, m);
  cc.evex().vfmadd132sd(xmm_a, xmm_b, m);
  cc.evex().vfmadd132ss(xmm_a, xmm_b, m);
  cc.evex().vfmadd213pd(xmm_a, xmm_b, m);
  cc.evex().vfmadd213pd(ymm_a, ymm_b, m);
  cc.evex().vfmadd213pd(zmm_a, zmm_b, m);
  cc.evex().vfmadd213ps(xmm_a, xmm_b, m);
  cc.evex().vfmadd213ps(ymm_a, ymm_b, m);
  cc.evex().vfmadd213ps(zmm_a, zmm_b, m);
  cc.evex().vfmadd213sd(xmm_a, xmm_b, m);
  cc.evex().vfmadd213ss(xmm_a, xmm_b, m);
  cc.evex().vfmadd231pd(xmm_a, xmm_b, m);
  cc.evex().vfmadd231pd(ymm_a, ymm_b, m);
  cc.evex().vfmadd231pd(zmm_a, zmm_b, m);
  cc.evex().vfmadd231ps(xmm_a, xmm_b, m);
  cc.evex().vfmadd231ps(ymm_a, ymm_b, m);
  cc.evex().vfmadd231ps(zmm_a, zmm_b, m);
  cc.evex().vfmadd231sd(xmm_a, xmm_b, m);
  cc.evex().vfmadd231ss(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub132pd(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub132pd(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub132pd(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub132ps(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub132ps(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub132ps(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub213pd(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub213pd(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub213pd(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub213ps(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub213ps(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub213ps(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub231pd(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub231pd(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub231pd(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub231ps(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub231ps(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub231ps(zmm_a, zmm_b, m);
  cc.evex().vfmsub132pd(xmm_a, xmm_b, m);
  cc.evex().vfmsub132pd(ymm_a, ymm_b, m);
  cc.evex().vfmsub132pd(zmm_a, zmm_b, m);
  cc.evex().vfmsub132ps(xmm_a, xmm_b, m);
  cc.evex().vfmsub132ps(ymm_a, ymm_b, m);
  cc.evex().vfmsub132ps(zmm_a, zmm_b, m);
  cc.evex().vfmsub132sd(xmm_a, xmm_b, m);
  cc.evex().vfmsub132ss(xmm_a, xmm_b, m);
  cc.evex().vfmsub213pd(xmm_a, xmm_b, m);
  cc.evex().vfmsub213pd(ymm_a, ymm_b, m);
  cc.evex().vfmsub213pd(zmm_a, zmm_b, m);
  cc.evex().vfmsub213ps(xmm_a, xmm_b, m);
  cc.evex().vfmsub213ps(ymm_a, ymm_b, m);
  cc.evex().vfmsub213ps(zmm_a, zmm_b, m);
  cc.evex().vfmsub213sd(xmm_a, xmm_b, m);
  cc.evex().vfmsub213ss(xmm_a, xmm_b, m);
  cc.evex().vfmsub231pd(xmm_a, xmm_b, m);
  cc.evex().vfmsub231pd(ymm_a, ymm_b, m);
  cc.evex().vfmsub231pd(zmm_a, zmm_b, m);
  cc.evex().vfmsub231ps(xmm_a, xmm_b, m);
  cc.evex().vfmsub231ps(ymm_a, ymm_b, m);
  cc.evex().vfmsub231ps(zmm_a, zmm_b, m);
  cc.evex().vfmsub231sd(xmm_a, xmm_b, m);
  cc.evex().vfmsub231ss(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd132pd(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd132pd(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd132pd(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd132ps(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd132ps(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd132ps(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd213pd(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd213pd(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd213pd(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd213ps(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd213ps(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd213ps(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd231pd(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd231pd(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd231pd(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd231ps(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd231ps(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd231ps(zmm_a, zmm_b, m);
  cc.evex().vfnmadd132pd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd132pd(ymm_a, ymm_b, m);
  cc.evex().vfnmadd132pd(zmm_a, zmm_b, m);
  cc.evex().vfnmadd132ps(xmm_a, xmm_b, m);
  cc.evex().vfnmadd132ps(ymm_a, ymm_b, m);
  cc.evex().vfnmadd132ps(zmm_a, zmm_b, m);
  cc.evex().vfnmadd132sd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd132ss(xmm_a, xmm_b, m);
  cc.evex().vfnmadd213pd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd213pd(ymm_a, ymm_b, m);
  cc.evex().vfnmadd213pd(zmm_a, zmm_b, m);
  cc.evex().vfnmadd213ps(xmm_a, xmm_b, m);
  cc.evex().vfnmadd213ps(ymm_a, ymm_b, m);
  cc.evex().vfnmadd213ps(zmm_a, zmm_b, m);
  cc.evex().vfnmadd213sd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd213ss(xmm_a, xmm_b, m);
  cc.evex().vfnmadd231pd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd231pd(ymm_a, ymm_b, m);
  cc.evex().vfnmadd231pd(zmm_a, zmm_b, m);
  cc.evex().vfnmadd231ps(xmm_a, xmm_b, m);
  cc.evex().vfnmadd231ps(ymm_a, ymm_b, m);
  cc.evex().vfnmadd231ps(zmm_a, zmm_b, m);
  cc.evex().vfnmadd231sd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd231ss(xmm_a, xmm_b, m);
  cc.evex().vfnmsub132pd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub132pd(ymm_a, ymm_b, m);
  cc.evex().vfnmsub132pd(zmm_a, zmm_b, m);
  cc.evex().vfnmsub132ps(xmm_a, xmm_b, m);
  cc.evex().vfnmsub132ps(ymm_a, ymm_b, m);
  cc.evex().vfnmsub132ps(zmm_a, zmm_b, m);
  cc.evex().vfnmsub132sd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub132ss(xmm_a, xmm_b, m);
  cc.evex().vfnmsub213pd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub213pd(ymm_a, ymm_b, m);
  cc.evex().vfnmsub213pd(zmm_a, zmm_b, m);
  cc.evex().vfnmsub213ps(xmm_a, xmm_b, m);
  cc.evex().vfnmsub213ps(ymm_a, ymm_b, m);
  cc.evex().vfnmsub213ps(zmm_a, zmm_b, m);
  cc.evex().vfnmsub213sd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub213ss(xmm_a, xmm_b, m);
  cc.evex().vfnmsub231pd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub231pd(ymm_a, ymm_b, m);
  cc.evex().vfnmsub231pd(zmm_a, zmm_b, m);
  cc.evex().vfnmsub231ps(xmm_a, xmm_b, m);
  cc.evex().vfnmsub231ps(ymm_a, ymm_b, m);
  cc.evex().vfnmsub231ps(zmm_a, zmm_b, m);
  cc.evex().vfnmsub231sd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub231ss(xmm_a, xmm_b, m);
  cc.evex().vfpclasspd(kA, m128, 0);
  cc.evex().vfpclasspd(kA, m256, 0);
  cc.evex().vfpclasspd(kA, m512, 0);
  cc.evex().vfpclassps(kA, m128, 0);
  cc.evex().vfpclassps(kA, m256, 0);
  cc.evex().vfpclassps(kA, m512, 0);
  cc.evex().vfpclasssd(kA, m, 0);
  cc.evex().vfpclassss(kA, m, 0);
  cc.evex().k(kA).vgatherdpd(xmm_a, vx_ptr);
  cc.evex().k(kA).vgatherdpd(ymm_a, vx_ptr);
  cc.evex().k(kA).vgatherdpd(zmm_a, vy_ptr);
  cc.evex().k(kA).vgatherdps(xmm_a, vx_ptr);
  cc.evex().k(kA).vgatherdps(ymm_a, vy_ptr);
  cc.evex().k(kA).vgatherdps(zmm_a, vz_ptr);
  cc.evex().k(kA).vgatherqpd(xmm_a, vx_ptr);
  cc.evex().k(kA).vgatherqpd(ymm_a, vy_ptr);
  cc.evex().k(kA).vgatherqpd(zmm_a, vz_ptr);
  cc.evex().k(kA).vgatherqps(xmm_a, vx_ptr);
  cc.evex().k(kA).vgatherqps(xmm_a, vy_ptr);
  cc.evex().k(kA).vgatherqps(ymm_a, vz_ptr);
  cc.evex().vgetexppd(xmm_a, m);
  cc.evex().vgetexppd(ymm_a, m);
  cc.evex().vgetexppd(zmm_a, m);
  cc.evex().vgetexpps(xmm_a, m);
  cc.evex().vgetexpps(ymm_a, m);
  cc.evex().vgetexpps(zmm_a, m);
  cc.evex().vgetexpsd(xmm_a, xmm_b, m);
  cc.evex().vgetexpss(xmm_a, xmm_b, m);
  cc.evex().vgetmantpd(xmm_a, m, 0);
  cc.evex().vgetmantpd(ymm_a, m, 0);
  cc.evex().vgetmantpd(zmm_a, m, 0);
  cc.evex().vgetmantps(xmm_a, m, 0);
  cc.evex().vgetmantps(ymm_a, m, 0);
  cc.evex().vgetmantps(zmm_a, m, 0);
  cc.evex().vgetmantsd(xmm_a, xmm_b, m, 0);
  cc.evex().vgetmantss(xmm_a, xmm_b, m, 0);
  cc.evex().vinsertf32x4(ymm_a, ymm_b, m, 0);
  cc.evex().vinsertf32x4(zmm_a, zmm_b, m, 0);
  cc.evex().vinsertf32x8(zmm_a, zmm_b, m, 0);
  cc.evex().vinsertf64x2(ymm_a, ymm_b, m, 0);
  cc.evex().vinsertf64x2(zmm_a, zmm_b, m, 0);
  cc.evex().vinsertf64x4(zmm_a, zmm_b, m, 0);
  cc.evex().vinserti32x4(ymm_a, ymm_b, m, 0);
  cc.evex().vinserti32x4(zmm_a, zmm_b, m, 0);
  cc.evex().vinserti32x8(zmm_a, zmm_b, m, 0);
  cc.evex().vinserti64x2(ymm_a, ymm_b, m, 0);
  cc.evex().vinserti64x2(zmm_a, zmm_b, m, 0);
  cc.evex().vinserti64x4(zmm_a, zmm_b, m, 0);
  cc.evex().vinsertps(xmm_a, xmm_b, m, 0);
  cc.evex().vmaxpd(xmm_a, xmm_b, m);
  cc.evex().vmaxpd(ymm_a, ymm_b, m);
  cc.evex().vmaxpd(zmm_a, zmm_b, m);
  cc.evex().vmaxps(xmm_a, xmm_b, m);
  cc.evex().vmaxps(ymm_a, ymm_b, m);
  cc.evex().vmaxps(zmm_a, zmm_b, m);
  cc.evex().vmaxsd(xmm_a, xmm_b, m);
  cc.evex().vmaxss(xmm_a, xmm_b, m);
  cc.evex().vminpd(xmm_a, xmm_b, m);
  cc.evex().vminpd(ymm_a, ymm_b, m);
  cc.evex().vminpd(zmm_a, zmm_b, m);
  cc.evex().vminps(xmm_a, xmm_b, m);
  cc.evex().vminps(ymm_a, ymm_b, m);
  cc.evex().vminps(zmm_a, zmm_b, m);
  cc.evex().vminsd(xmm_a, xmm_b, m);
  cc.evex().vminss(xmm_a, xmm_b, m);
  cc.evex().vmovapd(xmm_a, m);
  cc.evex().vmovapd(m, xmm_b);
  cc.evex().vmovapd(ymm_a, m);
  cc.evex().vmovapd(m, ymm_b);
  cc.evex().vmovapd(zmm_a, m);
  cc.evex().vmovapd(m, zmm_b);
  cc.evex().vmovaps(xmm_a, m);
  cc.evex().vmovaps(m, xmm_b);
  cc.evex().vmovaps(ymm_a, m);
  cc.evex().vmovaps(m, ymm_b);
  cc.evex().vmovaps(zmm_a, m);
  cc.evex().vmovaps(m, zmm_b);
  cc.evex().vmovd(m, xmm_b);
  cc.evex().vmovd(xmm_a, m);
  cc.evex().vmovddup(xmm_a, m);
  cc.evex().vmovddup(ymm_a, m);
  cc.evex().vmovddup(zmm_a, m);
  cc.evex().vmovdqa32(xmm_a, m);
  cc.evex().vmovdqa32(m, xmm_b);
  cc.evex().vmovdqa32(ymm_a, m);
  cc.evex().vmovdqa32(m, ymm_b);
  cc.evex().vmovdqa32(zmm_a, m);
  cc.evex().vmovdqa32(m, zmm_b);
  cc.evex().vmovdqa64(xmm_a, m);
  cc.evex().vmovdqa64(m, xmm_b);
  cc.evex().vmovdqa64(ymm_a, m);
  cc.evex().vmovdqa64(m, ymm_b);
  cc.evex().vmovdqa64(zmm_a, m);
  cc.evex().vmovdqa64(m, zmm_b);
  cc.evex().vmovdqu16(xmm_a, m);
  cc.evex().vmovdqu16(m, xmm_b);
  cc.evex().vmovdqu16(ymm_a, m);
  cc.evex().vmovdqu16(m, ymm_b);
  cc.evex().vmovdqu16(zmm_a, m);
  cc.evex().vmovdqu16(m, zmm_b);
  cc.evex().vmovdqu32(xmm_a, m);
  cc.evex().vmovdqu32(m, xmm_b);
  cc.evex().vmovdqu32(ymm_a, m);
  cc.evex().vmovdqu32(m, ymm_b);
  cc.evex().vmovdqu32(zmm_a, m);
  cc.evex().vmovdqu32(m, zmm_b);
  cc.evex().vmovdqu64(xmm_a, m);
  cc.evex().vmovdqu64(m, xmm_b);
  cc.evex().vmovdqu64(ymm_a, m);
  cc.evex().vmovdqu64(m, ymm_b);
  cc.evex().vmovdqu64(zmm_a, m);
  cc.evex().vmovdqu64(m, zmm_b);
  cc.evex().vmovdqu8(xmm_a, m);
  cc.evex().vmovdqu8(m, xmm_b);
  cc.evex().vmovdqu8(ymm_a, m);
  cc.evex().vmovdqu8(m, ymm_b);
  cc.evex().vmovdqu8(zmm_a, m);
  cc.evex().vmovdqu8(m, zmm_b);
  cc.evex().vmovhpd(m, xmm_b);
  cc.evex().vmovhpd(xmm_a, xmm_b, m);
  cc.evex().vmovhps(m, xmm_b);
  cc.evex().vmovhps(xmm_a, xmm_b, m);
  cc.evex().vmovlpd(m, xmm_b);
  cc.evex().vmovlpd(xmm_a, xmm_b, m);
  cc.evex().vmovlps(m, xmm_b);
  cc.evex().vmovlps(xmm_a, xmm_b, m);
  cc.evex().vmovntdq(m, xmm_b);
  cc.evex().vmovntdq(m, ymm_b);
  cc.evex().vmovntdq(m, zmm_b);
  cc.evex().vmovntdqa(xmm_a, m);
  cc.evex().vmovntdqa(ymm_a, m);
  cc.evex().vmovntdqa(zmm_a, m);
  cc.evex().vmovntpd(m, xmm_b);
  cc.evex().vmovntpd(m, ymm_b);
  cc.evex().vmovntpd(m, zmm_b);
  cc.evex().vmovntps(m, xmm_b);
  cc.evex().vmovntps(m, ymm_b);
  cc.evex().vmovntps(m, zmm_b);
  cc.evex().vmovq(m, xmm_b);
  cc.evex().vmovq(xmm_a, m);
  cc.evex().vmovq(xmm_a, m);
  cc.evex().vmovq(m, xmm_b);
  cc.evex().vmovsd(m, xmm_b);
  cc.evex().vmovsd(xmm_a, m);
  cc.evex().vmovshdup(xmm_a, m);
  cc.evex().vmovshdup(ymm_a, m);
  cc.evex().vmovshdup(zmm_a, m);
  cc.evex().vmovsldup(xmm_a, m);
  cc.evex().vmovsldup(ymm_a, m);
  cc.evex().vmovsldup(zmm_a, m);
  cc.evex().vmovss(m, xmm_b);
  cc.evex().vmovss(xmm_a, m);
  cc.evex().vmovupd(xmm_a, m);
  cc.evex().vmovupd(m, xmm_b);
  cc.evex().vmovupd(ymm_a, m);
  cc.evex().vmovupd(m, ymm_b);
  cc.evex().vmovupd(zmm_a, m);
  cc.evex().vmovupd(m, zmm_b);
  cc.evex().vmovups(xmm_a, m);
  cc.evex().vmovups(m, xmm_b);
  cc.evex().vmovups(ymm_a, m);
  cc.evex().vmovups(m, ymm_b);
  cc.evex().vmovups(zmm_a, m);
  cc.evex().vmovups(m, zmm_b);
  cc.evex().vmulpd(xmm_a, xmm_b, m);
  cc.evex().vmulpd(ymm_a, ymm_b, m);
  cc.evex().vmulpd(zmm_a, zmm_b, m);
  cc.evex().vmulps(xmm_a, xmm_b, m);
  cc.evex().vmulps(ymm_a, ymm_b, m);
  cc.evex().vmulps(zmm_a, zmm_b, m);
  cc.evex().vmulsd(xmm_a, xmm_b, m);
  cc.evex().vmulss(xmm_a, xmm_b, m);
  cc.evex().vorpd(xmm_a, xmm_b, m);
  cc.evex().vorpd(ymm_a, ymm_b, m);
  cc.evex().vorpd(zmm_a, zmm_b, m);
  cc.evex().vorps(xmm_a, xmm_b, m);
  cc.evex().vorps(ymm_a, ymm_b, m);
  cc.evex().vorps(zmm_a, zmm_b, m);
  cc.evex().vpabsb(xmm_a, m);
  cc.evex().vpabsb(ymm_a, m);
  cc.evex().vpabsb(zmm_a, m);
  cc.evex().vpabsd(xmm_a, m);
  cc.evex().vpabsd(ymm_a, m);
  cc.evex().vpabsd(zmm_a, m);
  cc.evex().vpabsq(xmm_a, m);
  cc.evex().vpabsq(ymm_a, m);
  cc.evex().vpabsq(zmm_a, m);
  cc.evex().vpabsw(xmm_a, m);
  cc.evex().vpabsw(ymm_a, m);
  cc.evex().vpabsw(zmm_a, m);
  cc.evex().vpackssdw(xmm_a, xmm_b, m);
  cc.evex().vpackssdw(ymm_a, ymm_b, m);
  cc.evex().vpackssdw(zmm_a, zmm_b, m);
  cc.evex().vpacksswb(xmm_a, xmm_b, m);
  cc.evex().vpacksswb(ymm_a, ymm_b, m);
  cc.evex().vpacksswb(zmm_a, zmm_b, m);
  cc.evex().vpackusdw(xmm_a, xmm_b, m);
  cc.evex().vpackusdw(ymm_a, ymm_b, m);
  cc.evex().vpackusdw(zmm_a, zmm_b, m);
  cc.evex().vpackuswb(xmm_a, xmm_b, m);
  cc.evex().vpackuswb(ymm_a, ymm_b, m);
  cc.evex().vpackuswb(zmm_a, zmm_b, m);
  cc.evex().vpaddb(xmm_a, xmm_b, m);
  cc.evex().vpaddb(ymm_a, ymm_b, m);
  cc.evex().vpaddb(zmm_a, zmm_b, m);
  cc.evex().vpaddd(xmm_a, xmm_b, m);
  cc.evex().vpaddd(ymm_a, ymm_b, m);
  cc.evex().vpaddd(zmm_a, zmm_b, m);
  cc.evex().vpaddq(xmm_a, xmm_b, m);
  cc.evex().vpaddq(ymm_a, ymm_b, m);
  cc.evex().vpaddq(zmm_a, zmm_b, m);
  cc.evex().vpaddsb(xmm_a, xmm_b, m);
  cc.evex().vpaddsb(ymm_a, ymm_b, m);
  cc.evex().vpaddsb(zmm_a, zmm_b, m);
  cc.evex().vpaddsw(xmm_a, xmm_b, m);
  cc.evex().vpaddsw(ymm_a, ymm_b, m);
  cc.evex().vpaddsw(zmm_a, zmm_b, m);
  cc.evex().vpaddusb(xmm_a, xmm_b, m);
  cc.evex().vpaddusb(ymm_a, ymm_b, m);
  cc.evex().vpaddusb(zmm_a, zmm_b, m);
  cc.evex().vpaddusw(xmm_a, xmm_b, m);
  cc.evex().vpaddusw(ymm_a, ymm_b, m);
  cc.evex().vpaddusw(zmm_a, zmm_b, m);
  cc.evex().vpaddw(xmm_a, xmm_b, m);
  cc.evex().vpaddw(ymm_a, ymm_b, m);
  cc.evex().vpaddw(zmm_a, zmm_b, m);
  cc.evex().vpalignr(xmm_a, xmm_b, m, 0);
  cc.evex().vpalignr(ymm_a, ymm_b, m, 0);
  cc.evex().vpalignr(zmm_a, zmm_b, m, 0);
  cc.evex().vpandd(xmm_a, xmm_b, m);
  cc.evex().vpandd(ymm_a, ymm_b, m);
  cc.evex().vpandd(zmm_a, zmm_b, m);
  cc.evex().vpandnd(xmm_a, xmm_b, m);
  cc.evex().vpandnd(ymm_a, ymm_b, m);
  cc.evex().vpandnd(zmm_a, zmm_b, m);
  cc.evex().vpandnq(xmm_a, xmm_b, m);
  cc.evex().vpandnq(ymm_a, ymm_b, m);
  cc.evex().vpandnq(zmm_a, zmm_b, m);
  cc.evex().vpandq(xmm_a, xmm_b, m);
  cc.evex().vpandq(ymm_a, ymm_b, m);
  cc.evex().vpandq(zmm_a, zmm_b, m);
  cc.evex().vpavgb(xmm_a, xmm_b, m);
  cc.evex().vpavgb(ymm_a, ymm_b, m);
  cc.evex().vpavgb(zmm_a, zmm_b, m);
  cc.evex().vpavgw(xmm_a, xmm_b, m);
  cc.evex().vpavgw(ymm_a, ymm_b, m);
  cc.evex().vpavgw(zmm_a, zmm_b, m);
  cc.evex().vpblendmb(xmm_a, xmm_b, m);
  cc.evex().vpblendmb(ymm_a, ymm_b, m);
  cc.evex().vpblendmb(zmm_a, zmm_b, m);
  cc.evex().vpblendmd(xmm_a, xmm_b, m);
  cc.evex().vpblendmd(ymm_a, ymm_b, m);
  cc.evex().vpblendmd(zmm_a, zmm_b, m);
  cc.evex().vpblendmq(xmm_a, xmm_b, m);
  cc.evex().vpblendmq(ymm_a, ymm_b, m);
  cc.evex().vpblendmq(zmm_a, zmm_b, m);
  cc.evex().vpblendmw(xmm_a, xmm_b, m);
  cc.evex().vpblendmw(ymm_a, ymm_b, m);
  cc.evex().vpblendmw(zmm_a, zmm_b, m);
  cc.evex().vpbroadcastb(xmm_a, m);
  cc.evex().vpbroadcastb(ymm_a, m);
  cc.evex().vpbroadcastb(zmm_a, m);
  cc.evex().vpbroadcastd(xmm_a, m);
  cc.evex().vpbroadcastd(ymm_a, m);
  cc.evex().vpbroadcastd(zmm_a, m);
  cc.evex().vpbroadcastq(xmm_a, m);
  cc.evex().vpbroadcastq(ymm_a, m);
  cc.evex().vpbroadcastq(zmm_a, m);
  cc.evex().vpbroadcastw(xmm_a, m);
  cc.evex().vpbroadcastw(ymm_a, m);
  cc.evex().vpbroadcastw(zmm_a, m);
  cc.evex().vpcmpb(kA, xmm_b, m, 0);
  cc.evex().vpcmpb(kA, ymm_b, m, 0);
  cc.evex().vpcmpb(kA, zmm_b, m, 0);
  cc.evex().vpcmpd(kA, xmm_b, m, 0);
  cc.evex().vpcmpd(kA, ymm_b, m, 0);
  cc.evex().vpcmpd(kA, zmm_b, m, 0);
  cc.evex().vpcmpeqb(kA, xmm_b, m);
  cc.evex().vpcmpeqb(kA, ymm_b, m);
  cc.evex().vpcmpeqb(kA, zmm_b, m);
  cc.evex().vpcmpeqd(kA, xmm_b, m);
  cc.evex().vpcmpeqd(kA, ymm_b, m);
  cc.evex().vpcmpeqd(kA, zmm_b, m);
  cc.evex().vpcmpeqq(kA, xmm_b, m);
  cc.evex().vpcmpeqq(kA, ymm_b, m);
  cc.evex().vpcmpeqq(kA, zmm_b, m);
  cc.evex().vpcmpeqw(kA, xmm_b, m);
  cc.evex().vpcmpeqw(kA, ymm_b, m);
  cc.evex().vpcmpeqw(kA, zmm_b, m);
  cc.evex().vpcmpgtb(kA, xmm_b, m);
  cc.evex().vpcmpgtb(kA, ymm_b, m);
  cc.evex().vpcmpgtb(kA, zmm_b, m);
  cc.evex().vpcmpgtd(kA, xmm_b, m);
  cc.evex().vpcmpgtd(kA, ymm_b, m);
  cc.evex().vpcmpgtd(kA, zmm_b, m);
  cc.evex().vpcmpgtq(kA, xmm_b, m);
  cc.evex().vpcmpgtq(kA, ymm_b, m);
  cc.evex().vpcmpgtq(kA, zmm_b, m);
  cc.evex().vpcmpgtw(kA, xmm_b, m);
  cc.evex().vpcmpgtw(kA, ymm_b, m);
  cc.evex().vpcmpgtw(kA, zmm_b, m);
  cc.evex().vpcmpq(kA, xmm_b, m, 0);
  cc.evex().vpcmpq(kA, ymm_b, m, 0);
  cc.evex().vpcmpq(kA, zmm_b, m, 0);
  cc.evex().vpcmpub(kA, xmm_b, m, 0);
  cc.evex().vpcmpub(kA, ymm_b, m, 0);
  cc.evex().vpcmpub(kA, zmm_b, m, 0);
  cc.evex().vpcmpud(kA, xmm_b, m, 0);
  cc.evex().vpcmpud(kA, ymm_b, m, 0);
  cc.evex().vpcmpud(kA, zmm_b, m, 0);
  cc.evex().vpcmpuq(kA, xmm_b, m, 0);
  cc.evex().vpcmpuq(kA, ymm_b, m, 0);
  cc.evex().vpcmpuq(kA, zmm_b, m, 0);
  cc.evex().vpcmpuw(kA, xmm_b, m, 0);
  cc.evex().vpcmpuw(kA, ymm_b, m, 0);
  cc.evex().vpcmpuw(kA, zmm_b, m, 0);
  cc.evex().vpcmpw(kA, xmm_b, m, 0);
  cc.evex().vpcmpw(kA, ymm_b, m, 0);
  cc.evex().vpcmpw(kA, zmm_b, m, 0);
  cc.evex().vpcompressd(m, xmm_b);
  cc.evex().vpcompressd(m, ymm_b);
  cc.evex().vpcompressd(m, zmm_b);
  cc.evex().vpcompressq(m, xmm_b);
  cc.evex().vpcompressq(m, ymm_b);
  cc.evex().vpcompressq(m, zmm_b);
  cc.evex().vpconflictd(xmm_a, m);
  cc.evex().vpconflictd(ymm_a, m);
  cc.evex().vpconflictd(zmm_a, m);
  cc.evex().vpconflictq(xmm_a, m);
  cc.evex().vpconflictq(ymm_a, m);
  cc.evex().vpconflictq(zmm_a, m);
  cc.evex().vpermb(xmm_a, xmm_b, m);
  cc.evex().vpermb(ymm_a, ymm_b, m);
  cc.evex().vpermb(zmm_a, zmm_b, m);
  cc.evex().vpermd(ymm_a, ymm_b, m);
  cc.evex().vpermd(zmm_a, zmm_b, m);
  cc.evex().vpermi2b(xmm_a, xmm_b, m);
  cc.evex().vpermi2b(ymm_a, ymm_b, m);
  cc.evex().vpermi2b(zmm_a, zmm_b, m);
  cc.evex().vpermi2d(xmm_a, xmm_b, m);
  cc.evex().vpermi2d(ymm_a, ymm_b, m);
  cc.evex().vpermi2d(zmm_a, zmm_b, m);
  cc.evex().vpermi2pd(xmm_a, xmm_b, m);
  cc.evex().vpermi2pd(ymm_a, ymm_b, m);
  cc.evex().vpermi2pd(zmm_a, zmm_b, m);
  cc.evex().vpermi2ps(xmm_a, xmm_b, m);
  cc.evex().vpermi2ps(ymm_a, ymm_b, m);
  cc.evex().vpermi2ps(zmm_a, zmm_b, m);
  cc.evex().vpermi2q(xmm_a, xmm_b, m);
  cc.evex().vpermi2q(ymm_a, ymm_b, m);
  cc.evex().vpermi2q(zmm_a, zmm_b, m);
  cc.evex().vpermi2w(xmm_a, xmm_b, m);
  cc.evex().vpermi2w(ymm_a, ymm_b, m);
  cc.evex().vpermi2w(zmm_a, zmm_b, m);
  cc.evex().vpermilpd(xmm_a, xmm_b, m);
  cc.evex().vpermilpd(ymm_a, ymm_b, m);
  cc.evex().vpermilpd(zmm_a, zmm_b, m);
  cc.evex().vpermilpd(xmm_a, m, 0);
  cc.evex().vpermilpd(ymm_a, m, 0);
  cc.evex().vpermilpd(zmm_a, m, 0);
  cc.evex().vpermilps(xmm_a, xmm_b, m);
  cc.evex().vpermilps(ymm_a, ymm_b, m);
  cc.evex().vpermilps(zmm_a, zmm_b, m);
  cc.evex().vpermilps(xmm_a, m, 0);
  cc.evex().vpermilps(ymm_a, m, 0);
  cc.evex().vpermilps(zmm_a, m, 0);
  cc.evex().vpermq(ymm_a, ymm_b, m);
  cc.evex().vpermq(zmm_a, zmm_b, m);
  cc.evex().vpermq(ymm_a, m, 0);
  cc.evex().vpermq(zmm_a, m, 0);
  cc.evex().vpermt2b(xmm_a, xmm_b, m);
  cc.evex().vpermt2b(ymm_a, ymm_b, m);
  cc.evex().vpermt2b(zmm_a, zmm_b, m);
  cc.evex().vpermt2d(xmm_a, xmm_b, m);
  cc.evex().vpermt2d(ymm_a, ymm_b, m);
  cc.evex().vpermt2d(zmm_a, zmm_b, m);
  cc.evex().vpermt2pd(xmm_a, xmm_b, m);
  cc.evex().vpermt2pd(ymm_a, ymm_b, m);
  cc.evex().vpermt2pd(zmm_a, zmm_b, m);
  cc.evex().vpermt2ps(xmm_a, xmm_b, m);
  cc.evex().vpermt2ps(ymm_a, ymm_b, m);
  cc.evex().vpermt2ps(zmm_a, zmm_b, m);
  cc.evex().vpermt2q(xmm_a, xmm_b, m);
  cc.evex().vpermt2q(ymm_a, ymm_b, m);
  cc.evex().vpermt2q(zmm_a, zmm_b, m);
  cc.evex().vpermt2w(xmm_a, xmm_b, m);
  cc.evex().vpermt2w(ymm_a, ymm_b, m);
  cc.evex().vpermt2w(zmm_a, zmm_b, m);
  cc.evex().vpermw(xmm_a, xmm_b, m);
  cc.evex().vpermw(ymm_a, ymm_b, m);
  cc.evex().vpermw(zmm_a, zmm_b, m);
  cc.evex().vpexpandd(xmm_a, m);
  cc.evex().vpexpandd(ymm_a, m);
  cc.evex().vpexpandd(zmm_a, m);
  cc.evex().vpexpandq(xmm_a, m);
  cc.evex().vpexpandq(ymm_a, m);
  cc.evex().vpexpandq(zmm_a, m);
  cc.evex().vpextrb(m, xmm_b, 0);
  cc.evex().vpextrd(m, xmm_b, 0);
  if (cc.is_64bit()) cc.evex().vpextrq(m, xmm_b, 0);
  cc.evex().vpextrw(m, xmm_b, 0);
  cc.evex().k(kA).vpgatherdd(xmm_a, vx_ptr);
  cc.evex().k(kA).vpgatherdd(ymm_a, vy_ptr);
  cc.evex().k(kA).vpgatherdd(zmm_a, vz_ptr);
  cc.evex().k(kA).vpgatherdq(xmm_a, vx_ptr);
  cc.evex().k(kA).vpgatherdq(ymm_a, vx_ptr);
  cc.evex().k(kA).vpgatherdq(zmm_a, vy_ptr);
  cc.evex().k(kA).vpgatherqd(xmm_a, vx_ptr);
  cc.evex().k(kA).vpgatherqd(xmm_a, vy_ptr);
  cc.evex().k(kA).vpgatherqd(ymm_a, vz_ptr);
  cc.evex().k(kA).vpgatherqq(xmm_a, vx_ptr);
  cc.evex().k(kA).vpgatherqq(ymm_a, vy_ptr);
  cc.evex().k(kA).vpgatherqq(zmm_a, vz_ptr);
  cc.evex().vpinsrb(xmm_a, xmm_b, m, 0);
  cc.evex().vpinsrd(xmm_a, xmm_b, m, 0);
  if (cc.is_64bit()) cc.evex().vpinsrq(xmm_a, xmm_b, m, 0);
  cc.evex().vpinsrw(xmm_a, xmm_b, m, 0);
  cc.evex().vplzcntd(xmm_a, m);
  cc.evex().vplzcntd(ymm_a, m);
  cc.evex().vplzcntd(zmm_a, m);
  cc.evex().vplzcntq(xmm_a, m);
  cc.evex().vplzcntq(ymm_a, m);
  cc.evex().vplzcntq(zmm_a, m);
  cc.evex().vpmadd52huq(xmm_a, xmm_b, m);
  cc.evex().vpmadd52huq(ymm_a, ymm_b, m);
  cc.evex().vpmadd52huq(zmm_a, zmm_b, m);
  cc.evex().vpmadd52luq(xmm_a, xmm_b, m);
  cc.evex().vpmadd52luq(ymm_a, ymm_b, m);
  cc.evex().vpmadd52luq(zmm_a, zmm_b, m);
  cc.evex().vpmaddubsw(xmm_a, xmm_b, m);
  cc.evex().vpmaddubsw(ymm_a, ymm_b, m);
  cc.evex().vpmaddubsw(zmm_a, zmm_b, m);
  cc.evex().vpmaddwd(xmm_a, xmm_b, m);
  cc.evex().vpmaddwd(ymm_a, ymm_b, m);
  cc.evex().vpmaddwd(zmm_a, zmm_b, m);
  cc.evex().vpmaxsb(xmm_a, xmm_b, m);
  cc.evex().vpmaxsb(ymm_a, ymm_b, m);
  cc.evex().vpmaxsb(zmm_a, zmm_b, m);
  cc.evex().vpmaxsd(xmm_a, xmm_b, m);
  cc.evex().vpmaxsd(ymm_a, ymm_b, m);
  cc.evex().vpmaxsd(zmm_a, zmm_b, m);
  cc.evex().vpmaxsq(xmm_a, xmm_b, m);
  cc.evex().vpmaxsq(ymm_a, ymm_b, m);
  cc.evex().vpmaxsq(zmm_a, zmm_b, m);
  cc.evex().vpmaxsw(xmm_a, xmm_b, m);
  cc.evex().vpmaxsw(ymm_a, ymm_b, m);
  cc.evex().vpmaxsw(zmm_a, zmm_b, m);
  cc.evex().vpmaxub(xmm_a, xmm_b, m);
  cc.evex().vpmaxub(ymm_a, ymm_b, m);
  cc.evex().vpmaxub(zmm_a, zmm_b, m);
  cc.evex().vpmaxud(xmm_a, xmm_b, m);
  cc.evex().vpmaxud(ymm_a, ymm_b, m);
  cc.evex().vpmaxud(zmm_a, zmm_b, m);
  cc.evex().vpmaxuq(xmm_a, xmm_b, m);
  cc.evex().vpmaxuq(ymm_a, ymm_b, m);
  cc.evex().vpmaxuq(zmm_a, zmm_b, m);
  cc.evex().vpmaxuw(xmm_a, xmm_b, m);
  cc.evex().vpmaxuw(ymm_a, ymm_b, m);
  cc.evex().vpmaxuw(zmm_a, zmm_b, m);
  cc.evex().vpminsb(xmm_a, xmm_b, m);
  cc.evex().vpminsb(ymm_a, ymm_b, m);
  cc.evex().vpminsb(zmm_a, zmm_b, m);
  cc.evex().vpminsd(xmm_a, xmm_b, m);
  cc.evex().vpminsd(ymm_a, ymm_b, m);
  cc.evex().vpminsd(zmm_a, zmm_b, m);
  cc.evex().vpminsq(xmm_a, xmm_b, m);
  cc.evex().vpminsq(ymm_a, ymm_b, m);
  cc.evex().vpminsq(zmm_a, zmm_b, m);
  cc.evex().vpminsw(xmm_a, xmm_b, m);
  cc.evex().vpminsw(ymm_a, ymm_b, m);
  cc.evex().vpminsw(zmm_a, zmm_b, m);
  cc.evex().vpminub(xmm_a, xmm_b, m);
  cc.evex().vpminub(ymm_a, ymm_b, m);
  cc.evex().vpminub(zmm_a, zmm_b, m);
  cc.evex().vpminud(xmm_a, xmm_b, m);
  cc.evex().vpminud(ymm_a, ymm_b, m);
  cc.evex().vpminud(zmm_a, zmm_b, m);
  cc.evex().vpminuq(xmm_a, xmm_b, m);
  cc.evex().vpminuq(ymm_a, ymm_b, m);
  cc.evex().vpminuq(zmm_a, zmm_b, m);
  cc.evex().vpminuw(xmm_a, xmm_b, m);
  cc.evex().vpminuw(ymm_a, ymm_b, m);
  cc.evex().vpminuw(zmm_a, zmm_b, m);
  cc.evex().vpmovdb(m, xmm_b);
  cc.evex().vpmovdb(m, ymm_b);
  cc.evex().vpmovdb(m, zmm_b);
  cc.evex().vpmovdw(m, xmm_b);
  cc.evex().vpmovdw(m, ymm_b);
  cc.evex().vpmovdw(m, zmm_b);
  cc.evex().vpmovqb(m, xmm_b);
  cc.evex().vpmovqb(m, ymm_b);
  cc.evex().vpmovqb(m, zmm_b);
  cc.evex().vpmovqd(m, xmm_b);
  cc.evex().vpmovqd(m, ymm_b);
  cc.evex().vpmovqd(m, zmm_b);
  cc.evex().vpmovqw(m, xmm_b);
  cc.evex().vpmovqw(m, ymm_b);
  cc.evex().vpmovqw(m, zmm_b);
  cc.evex().vpmovsdb(m, xmm_b);
  cc.evex().vpmovsdb(m, ymm_b);
  cc.evex().vpmovsdb(m, zmm_b);
  cc.evex().vpmovsdw(m, xmm_b);
  cc.evex().vpmovsdw(m, ymm_b);
  cc.evex().vpmovsdw(m, zmm_b);
  cc.evex().vpmovsqb(m, xmm_b);
  cc.evex().vpmovsqb(m, ymm_b);
  cc.evex().vpmovsqb(m, zmm_b);
  cc.evex().vpmovsqd(m, xmm_b);
  cc.evex().vpmovsqd(m, ymm_b);
  cc.evex().vpmovsqd(m, zmm_b);
  cc.evex().vpmovsqw(m, xmm_b);
  cc.evex().vpmovsqw(m, ymm_b);
  cc.evex().vpmovsqw(m, zmm_b);
  cc.evex().vpmovswb(m, xmm_b);
  cc.evex().vpmovswb(m, ymm_b);
  cc.evex().vpmovswb(m, zmm_b);
  cc.evex().vpmovsxbd(xmm_a, m);
  cc.evex().vpmovsxbd(ymm_a, m);
  cc.evex().vpmovsxbd(zmm_a, m);
  cc.evex().vpmovsxbq(xmm_a, m);
  cc.evex().vpmovsxbq(ymm_a, m);
  cc.evex().vpmovsxbq(zmm_a, m);
  cc.evex().vpmovsxbw(xmm_a, m);
  cc.evex().vpmovsxbw(ymm_a, m);
  cc.evex().vpmovsxbw(zmm_a, m);
  cc.evex().vpmovsxdq(xmm_a, m);
  cc.evex().vpmovsxdq(ymm_a, m);
  cc.evex().vpmovsxdq(zmm_a, m);
  cc.evex().vpmovsxwd(xmm_a, m);
  cc.evex().vpmovsxwd(ymm_a, m);
  cc.evex().vpmovsxwd(zmm_a, m);
  cc.evex().vpmovsxwq(xmm_a, m);
  cc.evex().vpmovsxwq(ymm_a, m);
  cc.evex().vpmovsxwq(zmm_a, m);
  cc.evex().vpmovusdb(m, xmm_b);
  cc.evex().vpmovusdb(m, ymm_b);
  cc.evex().vpmovusdb(m, zmm_b);
  cc.evex().vpmovusdw(m, xmm_b);
  cc.evex().vpmovusdw(m, ymm_b);
  cc.evex().vpmovusdw(m, zmm_b);
  cc.evex().vpmovusqb(m, xmm_b);
  cc.evex().vpmovusqb(m, ymm_b);
  cc.evex().vpmovusqb(m, zmm_b);
  cc.evex().vpmovusqd(m, xmm_b);
  cc.evex().vpmovusqd(m, ymm_b);
  cc.evex().vpmovusqd(m, zmm_b);
  cc.evex().vpmovusqw(m, xmm_b);
  cc.evex().vpmovusqw(m, ymm_b);
  cc.evex().vpmovusqw(m, zmm_b);
  cc.evex().vpmovuswb(m, xmm_b);
  cc.evex().vpmovuswb(m, ymm_b);
  cc.evex().vpmovuswb(m, zmm_b);
  cc.evex().vpmovwb(m, xmm_b);
  cc.evex().vpmovwb(m, ymm_b);
  cc.evex().vpmovwb(m, zmm_b);
  cc.evex().vpmovzxbd(xmm_a, m);
  cc.evex().vpmovzxbd(ymm_a, m);
  cc.evex().vpmovzxbd(zmm_a, m);
  cc.evex().vpmovzxbq(xmm_a, m);
  cc.evex().vpmovzxbq(ymm_a, m);
  cc.evex().vpmovzxbq(zmm_a, m);
  cc.evex().vpmovzxbw(xmm_a, m);
  cc.evex().vpmovzxbw(ymm_a, m);
  cc.evex().vpmovzxbw(zmm_a, m);
  cc.evex().vpmovzxdq(xmm_a, m);
  cc.evex().vpmovzxdq(ymm_a, m);
  cc.evex().vpmovzxdq(zmm_a, m);
  cc.evex().vpmovzxwd(xmm_a, m);
  cc.evex().vpmovzxwd(ymm_a, m);
  cc.evex().vpmovzxwd(zmm_a, m);
  cc.evex().vpmovzxwq(xmm_a, m);
  cc.evex().vpmovzxwq(ymm_a, m);
  cc.evex().vpmovzxwq(zmm_a, m);
  cc.evex().vpmuldq(xmm_a, xmm_b, m);
  cc.evex().vpmuldq(ymm_a, ymm_b, m);
  cc.evex().vpmuldq(zmm_a, zmm_b, m);
  cc.evex().vpmulhrsw(xmm_a, xmm_b, m);
  cc.evex().vpmulhrsw(ymm_a, ymm_b, m);
  cc.evex().vpmulhrsw(zmm_a, zmm_b, m);
  cc.evex().vpmulhuw(xmm_a, xmm_b, m);
  cc.evex().vpmulhuw(ymm_a, ymm_b, m);
  cc.evex().vpmulhuw(zmm_a, zmm_b, m);
  cc.evex().vpmulhw(xmm_a, xmm_b, m);
  cc.evex().vpmulhw(ymm_a, ymm_b, m);
  cc.evex().vpmulhw(zmm_a, zmm_b, m);
  cc.evex().vpmulld(xmm_a, xmm_b, m);
  cc.evex().vpmulld(ymm_a, ymm_b, m);
  cc.evex().vpmulld(zmm_a, zmm_b, m);
  cc.evex().vpmullq(xmm_a, xmm_b, m);
  cc.evex().vpmullq(ymm_a, ymm_b, m);
  cc.evex().vpmullq(zmm_a, zmm_b, m);
  cc.evex().vpmullw(xmm_a, xmm_b, m);
  cc.evex().vpmullw(ymm_a, ymm_b, m);
  cc.evex().vpmullw(zmm_a, zmm_b, m);
  cc.evex().vpmultishiftqb(xmm_a, xmm_b, m);
  cc.evex().vpmultishiftqb(ymm_a, ymm_b, m);
  cc.evex().vpmultishiftqb(zmm_a, zmm_b, m);
  cc.evex().vpmuludq(xmm_a, xmm_b, m);
  cc.evex().vpmuludq(ymm_a, ymm_b, m);
  cc.evex().vpmuludq(zmm_a, zmm_b, m);
  cc.evex().vpopcntd(zmm_a, m);
  cc.evex().vpopcntq(zmm_a, m);
  cc.evex().vpord(xmm_a, xmm_b, m);
  cc.evex().vpord(ymm_a, ymm_b, m);
  cc.evex().vpord(zmm_a, zmm_b, m);
  cc.evex().vporq(xmm_a, xmm_b, m);
  cc.evex().vporq(ymm_a, ymm_b, m);
  cc.evex().vporq(zmm_a, zmm_b, m);
  cc.evex().vprold(xmm_a, m, 0);
  cc.evex().vprold(ymm_a, m, 0);
  cc.evex().vprold(zmm_a, m, 0);
  cc.evex().vprolq(xmm_a, m, 0);
  cc.evex().vprolq(ymm_a, m, 0);
  cc.evex().vprolq(zmm_a, m, 0);
  cc.evex().vprolvd(xmm_a, xmm_b, m);
  cc.evex().vprolvd(ymm_a, ymm_b, m);
  cc.evex().vprolvd(zmm_a, zmm_b, m);
  cc.evex().vprolvq(xmm_a, xmm_b, m);
  cc.evex().vprolvq(ymm_a, ymm_b, m);
  cc.evex().vprolvq(zmm_a, zmm_b, m);
  cc.evex().vprord(xmm_a, m, 0);
  cc.evex().vprord(ymm_a, m, 0);
  cc.evex().vprord(zmm_a, m, 0);
  cc.evex().vprorq(xmm_a, m, 0);
  cc.evex().vprorq(ymm_a, m, 0);
  cc.evex().vprorq(zmm_a, m, 0);
  cc.evex().vprorvd(xmm_a, xmm_b, m);
  cc.evex().vprorvd(ymm_a, ymm_b, m);
  cc.evex().vprorvd(zmm_a, zmm_b, m);
  cc.evex().vprorvq(xmm_a, xmm_b, m);
  cc.evex().vprorvq(ymm_a, ymm_b, m);
  cc.evex().vprorvq(zmm_a, zmm_b, m);
  cc.evex().vpsadbw(xmm_a, xmm_b, m);
  cc.evex().vpsadbw(ymm_a, ymm_b, m);
  cc.evex().vpsadbw(zmm_a, zmm_b, m);
  cc.evex().k(kA).vpscatterdd(vx_ptr, xmm_b);
  cc.evex().k(kA).vpscatterdd(vy_ptr, ymm_b);
  cc.evex().k(kA).vpscatterdd(vz_ptr, zmm_b);
  cc.evex().k(kA).vpscatterdq(vx_ptr, xmm_b);
  cc.evex().k(kA).vpscatterdq(vx_ptr, ymm_b);
  cc.evex().k(kA).vpscatterdq(vy_ptr, zmm_b);
  cc.evex().k(kA).vpscatterqd(vx_ptr, xmm_b);
  cc.evex().k(kA).vpscatterqd(vy_ptr, xmm_b);
  cc.evex().k(kA).vpscatterqd(vz_ptr, ymm_b);
  cc.evex().k(kA).vpscatterqq(vx_ptr, xmm_b);
  cc.evex().k(kA).vpscatterqq(vy_ptr, ymm_b);
  cc.evex().k(kA).vpscatterqq(vz_ptr, zmm_b);
  cc.evex().vpshufb(xmm_a, xmm_b, m);
  cc.evex().vpshufb(ymm_a, ymm_b, m);
  cc.evex().vpshufb(zmm_a, zmm_b, m);
  cc.evex().vpshufd(xmm_a, m, 0);
  cc.evex().vpshufd(ymm_a, m, 0);
  cc.evex().vpshufd(zmm_a, m, 0);
  cc.evex().vpshufhw(xmm_a, m, 0);
  cc.evex().vpshufhw(ymm_a, m, 0);
  cc.evex().vpshufhw(zmm_a, m, 0);
  cc.evex().vpshuflw(xmm_a, m, 0);
  cc.evex().vpshuflw(ymm_a, m, 0);
  cc.evex().vpshuflw(zmm_a, m, 0);
  cc.evex().vpslld(xmm_a, xmm_b, m);
  cc.evex().vpslld(xmm_a, m, 0);
  cc.evex().vpslld(ymm_a, ymm_b, m);
  cc.evex().vpslld(ymm_a, m, 0);
  cc.evex().vpslld(zmm_a, zmm_b, m);
  cc.evex().vpslld(zmm_a, m, 0);
  cc.evex().vpslldq(xmm_a, m, 0);
  cc.evex().vpslldq(ymm_a, m, 0);
  cc.evex().vpslldq(zmm_a, m, 0);
  cc.evex().vpsllq(xmm_a, xmm_b, m);
  cc.evex().vpsllq(xmm_a, m, 0);
  cc.evex().vpsllq(ymm_a, ymm_b, m);
  cc.evex().vpsllq(ymm_a, m, 0);
  cc.evex().vpsllq(zmm_a, zmm_b, m);
  cc.evex().vpsllq(zmm_a, m, 0);
  cc.evex().vpsllvd(xmm_a, xmm_b, m);
  cc.evex().vpsllvd(ymm_a, ymm_b, m);
  cc.evex().vpsllvd(zmm_a, zmm_b, m);
  cc.evex().vpsllvq(xmm_a, xmm_b, m);
  cc.evex().vpsllvq(ymm_a, ymm_b, m);
  cc.evex().vpsllvq(zmm_a, zmm_b, m);
  cc.evex().vpsllvw(xmm_a, xmm_b, m);
  cc.evex().vpsllvw(ymm_a, ymm_b, m);
  cc.evex().vpsllvw(zmm_a, zmm_b, m);
  cc.evex().vpsllw(xmm_a, xmm_b, m);
  cc.evex().vpsllw(xmm_a, m, 0);
  cc.evex().vpsllw(ymm_a, ymm_b, m);
  cc.evex().vpsllw(ymm_a, m, 0);
  cc.evex().vpsllw(zmm_a, zmm_b, m);
  cc.evex().vpsllw(zmm_a, m, 0);
  cc.evex().vpsrad(xmm_a, xmm_b, m);
  cc.evex().vpsrad(xmm_a, m, 0);
  cc.evex().vpsrad(ymm_a, ymm_b, m);
  cc.evex().vpsrad(ymm_a, m, 0);
  cc.evex().vpsrad(zmm_a, zmm_b, m);
  cc.evex().vpsrad(zmm_a, m, 0);
  cc.evex().vpsraq(xmm_a, xmm_b, m);
  cc.evex().vpsraq(xmm_a, m, 0);
  cc.evex().vpsraq(ymm_a, ymm_b, m);
  cc.evex().vpsraq(ymm_a, m, 0);
  cc.evex().vpsraq(zmm_a, zmm_b, m);
  cc.evex().vpsraq(zmm_a, m, 0);
  cc.evex().vpsravd(xmm_a, xmm_b, m);
  cc.evex().vpsravd(ymm_a, ymm_b, m);
  cc.evex().vpsravd(zmm_a, zmm_b, m);
  cc.evex().vpsravq(xmm_a, xmm_b, m);
  cc.evex().vpsravq(ymm_a, ymm_b, m);
  cc.evex().vpsravq(zmm_a, zmm_b, m);
  cc.evex().vpsravw(xmm_a, xmm_b, m);
  cc.evex().vpsravw(ymm_a, ymm_b, m);
  cc.evex().vpsravw(zmm_a, zmm_b, m);
  cc.evex().vpsraw(xmm_a, xmm_b, m);
  cc.evex().vpsraw(xmm_a, m, 0);
  cc.evex().vpsraw(ymm_a, ymm_b, m);
  cc.evex().vpsraw(ymm_a, m, 0);
  cc.evex().vpsraw(zmm_a, zmm_b, m);
  cc.evex().vpsraw(zmm_a, m, 0);
  cc.evex().vpsrld(xmm_a, xmm_b, m);
  cc.evex().vpsrld(xmm_a, m, 0);
  cc.evex().vpsrld(ymm_a, ymm_b, m);
  cc.evex().vpsrld(ymm_a, m, 0);
  cc.evex().vpsrld(zmm_a, zmm_b, m);
  cc.evex().vpsrld(zmm_a, m, 0);
  cc.evex().vpsrldq(xmm_a, m, 0);
  cc.evex().vpsrldq(ymm_a, m, 0);
  cc.evex().vpsrldq(zmm_a, m, 0);
  cc.evex().vpsrlq(xmm_a, xmm_b, m);
  cc.evex().vpsrlq(xmm_a, m, 0);
  cc.evex().vpsrlq(ymm_a, ymm_b, m);
  cc.evex().vpsrlq(ymm_a, m, 0);
  cc.evex().vpsrlq(zmm_a, zmm_b, m);
  cc.evex().vpsrlq(zmm_a, m, 0);
  cc.evex().vpsrlvd(xmm_a, xmm_b, m);
  cc.evex().vpsrlvd(ymm_a, ymm_b, m);
  cc.evex().vpsrlvd(zmm_a, zmm_b, m);
  cc.evex().vpsrlvq(xmm_a, xmm_b, m);
  cc.evex().vpsrlvq(ymm_a, ymm_b, m);
  cc.evex().vpsrlvq(zmm_a, zmm_b, m);
  cc.evex().vpsrlvw(xmm_a, xmm_b, m);
  cc.evex().vpsrlvw(ymm_a, ymm_b, m);
  cc.evex().vpsrlvw(zmm_a, zmm_b, m);
  cc.evex().vpsrlw(xmm_a, xmm_b, m);
  cc.evex().vpsrlw(xmm_a, m, 0);
  cc.evex().vpsrlw(ymm_a, ymm_b, m);
  cc.evex().vpsrlw(ymm_a, m, 0);
  cc.evex().vpsrlw(zmm_a, zmm_b, m);
  cc.evex().vpsrlw(zmm_a, m, 0);
  cc.evex().vpsubb(xmm_a, xmm_b, m);
  cc.evex().vpsubb(ymm_a, ymm_b, m);
  cc.evex().vpsubb(zmm_a, zmm_b, m);
  cc.evex().vpsubd(xmm_a, xmm_b, m);
  cc.evex().vpsubd(ymm_a, ymm_b, m);
  cc.evex().vpsubd(zmm_a, zmm_b, m);
  cc.evex().vpsubq(xmm_a, xmm_b, m);
  cc.evex().vpsubq(ymm_a, ymm_b, m);
  cc.evex().vpsubq(zmm_a, zmm_b, m);
  cc.evex().vpsubsb(xmm_a, xmm_b, m);
  cc.evex().vpsubsb(ymm_a, ymm_b, m);
  cc.evex().vpsubsb(zmm_a, zmm_b, m);
  cc.evex().vpsubsw(xmm_a, xmm_b, m);
  cc.evex().vpsubsw(ymm_a, ymm_b, m);
  cc.evex().vpsubsw(zmm_a, zmm_b, m);
  cc.evex().vpsubusb(xmm_a, xmm_b, m);
  cc.evex().vpsubusb(ymm_a, ymm_b, m);
  cc.evex().vpsubusb(zmm_a, zmm_b, m);
  cc.evex().vpsubusw(xmm_a, xmm_b, m);
  cc.evex().vpsubusw(ymm_a, ymm_b, m);
  cc.evex().vpsubusw(zmm_a, zmm_b, m);
  cc.evex().vpsubw(xmm_a, xmm_b, m);
  cc.evex().vpsubw(ymm_a, ymm_b, m);
  cc.evex().vpsubw(zmm_a, zmm_b, m);
  cc.evex().vpternlogd(xmm_a, xmm_b, m, 0);
  cc.evex().vpternlogd(ymm_a, ymm_b, m, 0);
  cc.evex().vpternlogd(zmm_a, zmm_b, m, 0);
  cc.evex().vpternlogq(xmm_a, xmm_b, m, 0);
  cc.evex().vpternlogq(ymm_a, ymm_b, m, 0);
  cc.evex().vpternlogq(zmm_a, zmm_b, m, 0);
  cc.evex().vptestmb(kA, xmm_b, m);
  cc.evex().vptestmb(kA, ymm_b, m);
  cc.evex().vptestmb(kA, zmm_b, m);
  cc.evex().vptestmd(kA, xmm_b, m);
  cc.evex().vptestmd(kA, ymm_b, m);
  cc.evex().vptestmd(kA, zmm_b, m);
  cc.evex().vptestmq(kA, xmm_b, m);
  cc.evex().vptestmq(kA, ymm_b, m);
  cc.evex().vptestmq(kA, zmm_b, m);
  cc.evex().vptestmw(kA, xmm_b, m);
  cc.evex().vptestmw(kA, ymm_b, m);
  cc.evex().vptestmw(kA, zmm_b, m);
  cc.evex().vptestnmb(kA, xmm_b, m);
  cc.evex().vptestnmb(kA, ymm_b, m);
  cc.evex().vptestnmb(kA, zmm_b, m);
  cc.evex().vptestnmd(kA, xmm_b, m);
  cc.evex().vptestnmd(kA, ymm_b, m);
  cc.evex().vptestnmd(kA, zmm_b, m);
  cc.evex().vptestnmq(kA, xmm_b, m);
  cc.evex().vptestnmq(kA, ymm_b, m);
  cc.evex().vptestnmq(kA, zmm_b, m);
  cc.evex().vptestnmw(kA, xmm_b, m);
  cc.evex().vptestnmw(kA, ymm_b, m);
  cc.evex().vptestnmw(kA, zmm_b, m);
  cc.evex().vpunpckhbw(xmm_a, xmm_b, m);
  cc.evex().vpunpckhbw(ymm_a, ymm_b, m);
  cc.evex().vpunpckhbw(zmm_a, zmm_b, m);
  cc.evex().vpunpckhdq(xmm_a, xmm_b, m);
  cc.evex().vpunpckhdq(ymm_a, ymm_b, m);
  cc.evex().vpunpckhdq(zmm_a, zmm_b, m);
  cc.evex().vpunpckhqdq(xmm_a, xmm_b, m);
  cc.evex().vpunpckhqdq(ymm_a, ymm_b, m);
  cc.evex().vpunpckhqdq(zmm_a, zmm_b, m);
  cc.evex().vpunpckhwd(xmm_a, xmm_b, m);
  cc.evex().vpunpckhwd(ymm_a, ymm_b, m);
  cc.evex().vpunpckhwd(zmm_a, zmm_b, m);
  cc.evex().vpunpcklbw(xmm_a, xmm_b, m);
  cc.evex().vpunpcklbw(ymm_a, ymm_b, m);
  cc.evex().vpunpcklbw(zmm_a, zmm_b, m);
  cc.evex().vpunpckldq(xmm_a, xmm_b, m);
  cc.evex().vpunpckldq(ymm_a, ymm_b, m);
  cc.evex().vpunpckldq(zmm_a, zmm_b, m);
  cc.evex().vpunpcklqdq(xmm_a, xmm_b, m);
  cc.evex().vpunpcklqdq(ymm_a, ymm_b, m);
  cc.evex().vpunpcklqdq(zmm_a, zmm_b, m);
  cc.evex().vpunpcklwd(xmm_a, xmm_b, m);
  cc.evex().vpunpcklwd(ymm_a, ymm_b, m);
  cc.evex().vpunpcklwd(zmm_a, zmm_b, m);
  cc.evex().vpxord(xmm_a, xmm_b, m);
  cc.evex().vpxord(ymm_a, ymm_b, m);
  cc.evex().vpxord(zmm_a, zmm_b, m);
  cc.evex().vpxorq(xmm_a, xmm_b, m);
  cc.evex().vpxorq(ymm_a, ymm_b, m);
  cc.evex().vpxorq(zmm_a, zmm_b, m);
  cc.evex().vrangepd(xmm_a, xmm_b, m, 0);
  cc.evex().vrangepd(ymm_a, ymm_b, m, 0);
  cc.evex().vrangepd(zmm_a, zmm_b, m, 0);
  cc.evex().vrangeps(xmm_a, xmm_b, m, 0);
  cc.evex().vrangeps(ymm_a, ymm_b, m, 0);
  cc.evex().vrangeps(zmm_a, zmm_b, m, 0);
  cc.evex().vrangesd(xmm_a, xmm_b, m, 0);
  cc.evex().vrangess(xmm_a, xmm_b, m, 0);
  cc.evex().vrcp14pd(xmm_a, m);
  cc.evex().vrcp14pd(ymm_a, m);
  cc.evex().vrcp14pd(zmm_a, m);
  cc.evex().vrcp14ps(xmm_a, m);
  cc.evex().vrcp14ps(ymm_a, m);
  cc.evex().vrcp14ps(zmm_a, m);
  cc.evex().vrcp14sd(xmm_a, xmm_b, m);
  cc.evex().vrcp14ss(xmm_a, xmm_b, m);
  cc.evex().vreducepd(xmm_a, m, 0);
  cc.evex().vreducepd(ymm_a, m, 0);
  cc.evex().vreducepd(zmm_a, m, 0);
  cc.evex().vreduceps(xmm_a, m, 0);
  cc.evex().vreduceps(ymm_a, m, 0);
  cc.evex().vreduceps(zmm_a, m, 0);
  cc.evex().vreducesd(xmm_a, xmm_b, m, 0);
  cc.evex().vreducess(xmm_a, xmm_b, m, 0);
  cc.evex().vrndscalepd(xmm_a, m, 0);
  cc.evex().vrndscalepd(ymm_a, m, 0);
  cc.evex().vrndscalepd(zmm_a, m, 0);
  cc.evex().vrndscaleps(xmm_a, m, 0);
  cc.evex().vrndscaleps(ymm_a, m, 0);
  cc.evex().vrndscaleps(zmm_a, m, 0);
  cc.evex().vrndscalesd(xmm_a, xmm_b, m, 0);
  cc.evex().vrndscaless(xmm_a, xmm_b, m, 0);
  cc.evex().vrsqrt14pd(xmm_a, m);
  cc.evex().vrsqrt14pd(ymm_a, m);
  cc.evex().vrsqrt14pd(zmm_a, m);
  cc.evex().vrsqrt14ps(xmm_a, m);
  cc.evex().vrsqrt14ps(ymm_a, m);
  cc.evex().vrsqrt14ps(zmm_a, m);
  cc.evex().vrsqrt14sd(xmm_a, xmm_b, m);
  cc.evex().vrsqrt14ss(xmm_a, xmm_b, m);
  cc.evex().vscalefpd(xmm_a, xmm_b, m);
  cc.evex().vscalefpd(ymm_a, ymm_b, m);
  cc.evex().vscalefpd(zmm_a, zmm_b, m);
  cc.evex().vscalefps(xmm_a, xmm_b, m);
  cc.evex().vscalefps(ymm_a, ymm_b, m);
  cc.evex().vscalefps(zmm_a, zmm_b, m);
  cc.evex().vscalefsd(xmm_a, xmm_b, m);
  cc.evex().vscalefss(xmm_a, xmm_b, m);
  cc.evex().k(kA).vscatterdpd(vx_ptr, xmm_b);
  cc.evex().k(kA).vscatterdpd(vx_ptr, ymm_b);
  cc.evex().k(kA).vscatterdpd(vy_ptr, zmm_b);
  cc.evex().k(kA).vscatterdps(vx_ptr, xmm_b);
  cc.evex().k(kA).vscatterdps(vy_ptr, ymm_b);
  cc.evex().k(kA).vscatterdps(vz_ptr, zmm_b);
  cc.evex().k(kA).vscatterqpd(vx_ptr, xmm_b);
  cc.evex().k(kA).vscatterqpd(vy_ptr, ymm_b);
  cc.evex().k(kA).vscatterqpd(vz_ptr, zmm_b);
  cc.evex().k(kA).vscatterqps(vx_ptr, xmm_b);
  cc.evex().k(kA).vscatterqps(vy_ptr, xmm_b);
  cc.evex().k(kA).vscatterqps(vz_ptr, ymm_b);
  cc.evex().vshuff32x4(ymm_a, ymm_b, m, 0);
  cc.evex().vshuff32x4(zmm_a, zmm_b, m, 0);
  cc.evex().vshuff64x2(ymm_a, ymm_b, m, 0);
  cc.evex().vshuff64x2(zmm_a, zmm_b, m, 0);
  cc.evex().vshufi32x4(ymm_a, ymm_b, m, 0);
  cc.evex().vshufi32x4(zmm_a, zmm_b, m, 0);
  cc.evex().vshufi64x2(ymm_a, ymm_b, m, 0);
  cc.evex().vshufi64x2(zmm_a, zmm_b, m, 0);
  cc.evex().vshufpd(xmm_a, xmm_b, m, 0);
  cc.evex().vshufpd(ymm_a, ymm_b, m, 0);
  cc.evex().vshufpd(zmm_a, zmm_b, m, 0);
  cc.evex().vshufps(xmm_a, xmm_b, m, 0);
  cc.evex().vshufps(ymm_a, ymm_b, m, 0);
  cc.evex().vshufps(zmm_a, zmm_b, m, 0);
  cc.evex().vsqrtpd(xmm_a, m);
  cc.evex().vsqrtpd(ymm_a, m);
  cc.evex().vsqrtpd(zmm_a, m);
  cc.evex().vsqrtps(xmm_a, m);
  cc.evex().vsqrtps(ymm_a, m);
  cc.evex().vsqrtps(zmm_a, m);
  cc.evex().vsqrtsd(xmm_a, xmm_b, m);
  cc.evex().vsqrtss(xmm_a, xmm_b, m);
  cc.evex().vsubpd(xmm_a, xmm_b, m);
  cc.evex().vsubpd(ymm_a, ymm_b, m);
  cc.evex().vsubpd(zmm_a, zmm_b, m);
  cc.evex().vsubps(xmm_a, xmm_b, m);
  cc.evex().vsubps(ymm_a, ymm_b, m);
  cc.evex().vsubps(zmm_a, zmm_b, m);
  cc.evex().vsubsd(xmm_a, xmm_b, m);
  cc.evex().vsubss(xmm_a, xmm_b, m);
  cc.evex().vucomisd(xmm_a, m);
  cc.evex().vucomiss(xmm_a, m);
  cc.evex().vunpckhpd(xmm_a, xmm_b, m);
  cc.evex().vunpckhpd(ymm_a, ymm_b, m);
  cc.evex().vunpckhpd(zmm_a, zmm_b, m);
  cc.evex().vunpckhps(xmm_a, xmm_b, m);
  cc.evex().vunpckhps(ymm_a, ymm_b, m);
  cc.evex().vunpckhps(zmm_a, zmm_b, m);
  cc.evex().vunpcklpd(xmm_a, xmm_b, m);
  cc.evex().vunpcklpd(ymm_a, ymm_b, m);
  cc.evex().vunpcklpd(zmm_a, zmm_b, m);
  cc.evex().vunpcklps(xmm_a, xmm_b, m);
  cc.evex().vunpcklps(ymm_a, ymm_b, m);
  cc.evex().vunpcklps(zmm_a, zmm_b, m);
  cc.evex().vxorpd(xmm_a, xmm_b, m);
  cc.evex().vxorpd(ymm_a, ymm_b, m);
  cc.evex().vxorpd(zmm_a, zmm_b, m);
  cc.evex().vxorps(xmm_a, xmm_b, m);
  cc.evex().vxorps(ymm_a, ymm_b, m);
  cc.evex().vxorps(zmm_a, zmm_b, m);
}

// Generates a long sequence of AVX512 instructions.
// Dispatches to the reg-only or reg/mem flavor of the AVX512 test sequence.
//
// The heavy lifting is done by `generate_avx512_sequence_internal_reg_only()`
// and `generate_avx512_sequence_internal_reg_mem()`; this wrapper only selects
// one of them based on `form`.
template<typename Emitter>
static void generate_avx512_sequence_internal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  switch (form) {
    case InstForm::kReg:
      generate_avx512_sequence_internal_reg_only(cc, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
      break;
    default:
      generate_avx512_sequence_internal_reg_mem(cc, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
      break;
  }
}

static void generate_avx512_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.new_gpz("gp");
    Vec vec_a = cc.new_zmm("vec_a");
    Vec vec_b = cc.new_zmm("vec_b");
    Vec vec_c = cc.new_zmm("vec_c");
    Vec vec_d = cc.new_zmm("vec_d");

    KReg kA = cc.new_kq("kA");
    KReg kB = cc.new_kq("kB");
    KReg kC = cc.new_kq("kC");

    cc.add_func(FuncSignature::build<void>());
    generate_avx512_sequence_internal(cc, form, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
    cc.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
    }

    return;
  }
}

// Benchmarks one code generator function (`emitter_fn`) across all available
// emitter types (Assembler, Builder, Compiler) and prints the results.
//
// @param arch Target architecture (X86 or X64).
// @param num_iterations Number of benchmark iterations to run.
// @param description Human readable benchmark name, printed as a header.
// @param emitter_fn Callable `(Emitter&, bool emit_prolog_epilog)` that emits code.
template<typename EmitterFn>
static void benchmark_x86_function(Arch arch, uint32_t num_iterations, const char* description, const EmitterFn& emitter_fn) noexcept {
  CodeHolder code;
  printf("%s:\n", description);

  // Counted once via Builder (when compiled in) so each run can report
  // per-instruction statistics.
  uint32_t instruction_count = 0;

#ifndef ASMJIT_NO_BUILDER
  instruction_count = asmjit_perf_utils::calculate_instruction_count<x86::Builder>(code, arch, [&](x86::Builder& builder) {
    emitter_fn(builder, false);
  });
#endif

  // Assembler: raw emission, emission with instruction validation enabled,
  // and emission including a function prolog/epilog.
  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[raw]", instruction_count, [&](x86::Assembler& assembler) {
    emitter_fn(assembler, false);
  });

  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[validated]", instruction_count, [&](x86::Assembler& assembler) {
    assembler.add_diagnostic_options(DiagnosticOptions::kValidateAssembler);
    emitter_fn(assembler, false);
  });

  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[prolog/epilog]", instruction_count, [&](x86::Assembler& assembler) {
    emitter_fn(assembler, true);
  });

#ifndef ASMJIT_NO_BUILDER
  // Builder: building the node list only, building + serializing to machine
  // code, and the same with a prolog/epilog.
  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[no-asm]", instruction_count, [&](x86::Builder& builder) {
    emitter_fn(builder, false);
  });

  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[finalized]", instruction_count, [&](x86::Builder& builder) {
    emitter_fn(builder, false);
    builder.finalize();
  });

  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[prolog/epilog]", instruction_count, [&](x86::Builder& builder) {
    emitter_fn(builder, true);
    builder.finalize();
  });
#endif

#ifndef ASMJIT_NO_COMPILER
  // Compiler: IR construction only, and IR construction + register
  // allocation + serialization.
  asmjit_perf_utils::bench<x86::Compiler>(code, arch, num_iterations, "[no-asm]", instruction_count, [&](x86::Compiler& compiler) {
    emitter_fn(compiler, true);
  });

  asmjit_perf_utils::bench<x86::Compiler>(code, arch, num_iterations, "[finalized]", instruction_count, [&](x86::Compiler& compiler) {
    emitter_fn(compiler, true);
    compiler.finalize();
  });
#endif

  printf("\n");
}

void benchmark_x86_emitters(uint32_t num_iterations, bool test_x86, bool test_x64) {
  uint32_t i = 0;
  uint32_t n = 0;

  Arch archs[2] {};

  if (test_x86) archs[n++] = Arch::kX86;
  if (test_x64) archs[n++] = Arch::kX64;

  for (i = 0; i < n; i++) {
    static const char description[] = "Empty function (mov + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_empty_function(emitter, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "4-Ops sequence (4 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 4, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "16-Ops sequence (16 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 16, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "32-Ops sequence (32 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 32, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "64-Ops sequence (64 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 64, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "GpSequence<Reg> (Sequence of GP instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_gp_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "GpSequence<Mem> (Sequence of GP instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_gp_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseSequence<Reg> (sequence of SSE+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_sse_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseSequence<Mem> (sequence of SSE+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_sse_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "AvxSequence<Reg> (sequence of AVX+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "AvxSequence<Mem> (sequence of AVX+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "Avx512Sequence<Reg> (sequence of AVX512+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx512_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "Avx512Sequence<Mem> (sequence of AVX512+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx512_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseAlphaBlend (alpha-blend function with labels and jumps)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      asmtest::generate_sse_alpha_blend(emitter, emit_prolog_epilog);
    });
  }
}

#endif // !ASMJIT_NO_X86
